[llvm] Move ExpandMemCmp and MergeIcmp to the middle end (PR #77370)

Gabriel Baraldi via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 8 12:18:24 PST 2024


https://github.com/gbaraldi created https://github.com/llvm/llvm-project/pull/77370

This should allow for optimizations like saving repeated loads between memcmp calls; cf. https://godbolt.org/z/bEna4Md9r, where the window on the right shows the generated output of this branch.

One question is where to put this in the pipeline. The inline expansions probably benefit the most from the passes we run early, but we might want to wait a bit so that we can try to prove a constant argument to the memcmp (though I imagine that if the arguments are going to be constant, they come directly from the source).

One other question is that some of the x86 tests are massive; I just ported them over from the `llc` tests, but they grew quite a bit in the process.

>From 4440a91823cf878ea0dec29fb7d511a25f4333c0 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Mon, 8 Jan 2024 17:04:08 -0300
Subject: [PATCH] Move ExpandMemCmp and MergeIcmp to the middle end to allow
 for better optimization of the inline expansions

---
 .../include/llvm/CodeGen/CodeGenPassBuilder.h |    10 -
 .../llvm/CodeGen/MachinePassRegistry.def      |     2 -
 llvm/include/llvm/CodeGen/Passes.h            |     2 -
 llvm/include/llvm/InitializePasses.h          |     1 -
 llvm/include/llvm/LinkAllPasses.h             |     1 -
 .../Scalar}/ExpandMemCmp.h                    |     6 +-
 llvm/lib/CodeGen/CMakeLists.txt               |     1 -
 llvm/lib/CodeGen/CodeGen.cpp                  |     1 -
 llvm/lib/CodeGen/TargetPassConfig.cpp         |    11 -
 llvm/lib/Passes/PassBuilder.cpp               |     2 +-
 llvm/lib/Passes/PassBuilderPipelines.cpp      |     6 +
 llvm/lib/Passes/PassRegistry.def              |     3 +-
 llvm/lib/Transforms/Scalar/CMakeLists.txt     |     1 +
 .../Scalar}/ExpandMemCmp.cpp                  |   133 +-
 llvm/test/CodeGen/AArch64/O3-pipeline.ll      |     7 -
 .../test/CodeGen/AArch64/bcmp-inline-small.ll |    98 -
 llvm/test/CodeGen/AArch64/bcmp.ll             |   537 -
 .../test/CodeGen/AArch64/dag-combine-setcc.ll |    31 +-
 .../AArch64/machine-licm-hoist-load.ll        |   128 +-
 llvm/test/CodeGen/AArch64/memcmp.ll           |  3029 ---
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |    28 -
 llvm/test/CodeGen/ARM/O3-pipeline.ll          |     7 -
 llvm/test/CodeGen/BPF/memcmp.ll               |    77 -
 llvm/test/CodeGen/Generic/llc-start-stop.ll   |     6 +-
 llvm/test/CodeGen/LoongArch/opt-pipeline.ll   |     9 +-
 llvm/test/CodeGen/M68k/pipeline.ll            |     7 -
 llvm/test/CodeGen/PowerPC/O3-pipeline.ll      |     9 +-
 .../memCmpUsedInZeroEqualityComparison.ll     |   168 -
 .../CodeGen/PowerPC/memcmp-mergeexpand.ll     |    39 -
 llvm/test/CodeGen/PowerPC/memcmp.ll           |    62 -
 llvm/test/CodeGen/PowerPC/memcmpIR.ll         |   178 -
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |     9 +-
 llvm/test/CodeGen/X86/memcmp-mergeexpand.ll   |    49 -
 llvm/test/CodeGen/X86/memcmp-minsize-x32.ll   |   445 -
 llvm/test/CodeGen/X86/memcmp-minsize.ll       |   433 -
 .../CodeGen/X86/memcmp-more-load-pairs-x32.ll |  2911 ---
 .../CodeGen/X86/memcmp-more-load-pairs.ll     |  4006 ---
 llvm/test/CodeGen/X86/memcmp-optsize-x32.ll   |   583 -
 llvm/test/CodeGen/X86/memcmp-optsize.ll       |   596 -
 llvm/test/CodeGen/X86/memcmp-pgso-x32.ll      |   600 -
 llvm/test/CodeGen/X86/memcmp-pgso.ll          |   613 -
 llvm/test/CodeGen/X86/memcmp-x32.ll           |  2429 --
 llvm/test/CodeGen/X86/memcmp.ll               |  3065 ---
 llvm/test/CodeGen/X86/opt-pipeline.ll         |     9 +-
 llvm/test/Other/new-pm-defaults.ll            |     4 +-
 .../Other/new-pm-thinlto-postlink-defaults.ll |     4 +-
 .../new-pm-thinlto-postlink-pgo-defaults.ll   |     4 +-
 ...-pm-thinlto-postlink-samplepgo-defaults.ll |     4 +-
 .../Other/new-pm-thinlto-prelink-defaults.ll  |     4 +-
 .../new-pm-thinlto-prelink-pgo-defaults.ll    |    26 +-
 ...w-pm-thinlto-prelink-samplepgo-defaults.ll |     4 +-
 .../Transforms/ExpandMemCmp/AArch64/bcmp.ll   |   751 +
 .../ExpandMemCmp/AArch64/memcmp-extra.ll      |  3434 +++
 .../Transforms/ExpandMemCmp/AArch64/memcmp.ll |     1 -
 .../Transforms/ExpandMemCmp/BPF/lit.local.cfg |     4 +
 .../Transforms/ExpandMemCmp/BPF/memcmp.ll     |   119 +
 .../ExpandMemCmp/PowerPC/lit.local.cfg        |     2 +
 .../memCmpUsedInZeroEqualityComparison.ll     |   218 +
 .../PowerPC/memcmp-mergeexpand.ll             |    48 +
 .../Transforms/ExpandMemCmp/PowerPC/memcmp.ll |    70 +
 .../ExpandMemCmp/PowerPC/memcmpIR.ll          |   216 +
 llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll |    16 +-
 .../Transforms/ExpandMemCmp/X86/memcmp-2.ll   | 20249 ++++++++++++++++
 .../ExpandMemCmp}/X86/memcmp-constant.ll      |    89 +-
 .../ExpandMemCmp/X86/memcmp-minsize-x32.ll    |   493 +
 .../ExpandMemCmp/X86/memcmp-minsize.ll        |   707 +
 .../X86/memcmp-more-load-pairs-x32.ll         |  6203 +++++
 .../X86/memcmp-more-load-pairs.ll             | 18833 ++++++++++++++
 .../ExpandMemCmp/X86/memcmp-nobuiltin.ll      |   248 +
 .../ExpandMemCmp/X86/memcmp-optsize-x32.ll    |   870 +
 .../ExpandMemCmp/X86/memcmp-optsize.ll        |  1414 ++
 .../ExpandMemCmp/X86/memcmp-pgso-x32.ll       |   887 +
 .../ExpandMemCmp/X86/memcmp-pgso.ll           |  1347 +
 .../ExpandMemCmp/X86/memcmp-x32-2.ll          |  4813 ++++
 .../Transforms/ExpandMemCmp/X86/memcmp-x32.ll |   523 +-
 .../Transforms/ExpandMemCmp/X86/memcmp.ll     |  1194 +-
 .../PhaseOrdering/PowerPC/lit.local.cfg       |     2 +
 .../PhaseOrdering/X86/memcmp-early.ll         |    86 +
 .../PhaseOrdering/X86/memcmp-mergeexpand.ll   |    62 +
 .../Transforms/PhaseOrdering/X86/memcmp.ll    |   856 +
 llvm/tools/opt/opt.cpp                        |     1 -
 .../gn/secondary/llvm/lib/CodeGen/BUILD.gn    |     1 -
 .../llvm/lib/Transforms/Scalar/BUILD.gn       |     1 +
 83 files changed, 63081 insertions(+), 21075 deletions(-)
 rename llvm/include/llvm/{CodeGen => Transforms/Scalar}/ExpandMemCmp.h (83%)
 rename llvm/lib/{CodeGen => Transforms/Scalar}/ExpandMemCmp.cpp (90%)
 delete mode 100644 llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/bcmp.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/memcmp.ll
 delete mode 100644 llvm/test/CodeGen/BPF/memcmp.ll
 delete mode 100644 llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
 delete mode 100644 llvm/test/CodeGen/PowerPC/memcmp-mergeexpand.ll
 delete mode 100644 llvm/test/CodeGen/PowerPC/memcmp.ll
 delete mode 100644 llvm/test/CodeGen/PowerPC/memcmpIR.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-mergeexpand.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-minsize.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-optsize.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-pgso.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-x32.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/AArch64/bcmp.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp-extra.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/BPF/lit.local.cfg
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/BPF/memcmp.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/PowerPC/lit.local.cfg
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/PowerPC/memCmpUsedInZeroEqualityComparison.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp-mergeexpand.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmpIR.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-2.ll
 rename llvm/test/{CodeGen => Transforms/ExpandMemCmp}/X86/memcmp-constant.ll (50%)
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize-x32.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs-x32.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-nobuiltin.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize-x32.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso-x32.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32-2.ll
 create mode 100644 llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/memcmp-mergeexpand.ll
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll

diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
index a7cbb0910baabf..556304231b397b 100644
--- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
@@ -25,7 +25,6 @@
 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
 #include "llvm/CodeGen/CallBrPrepare.h"
 #include "llvm/CodeGen/DwarfEHPrepare.h"
-#include "llvm/CodeGen/ExpandMemCmp.h"
 #include "llvm/CodeGen/ExpandReductions.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/IndirectBrExpand.h"
@@ -629,15 +628,6 @@ void CodeGenPassBuilder<Derived>::addIRPasses(AddIRPass &addPass) const {
       addPass(PrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
-  if (getOptLevel() != CodeGenOptLevel::None) {
-    // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
-    // loads and compares. ExpandMemCmpPass then tries to expand those calls
-    // into optimally-sized loads and compares. The transforms are enabled by a
-    // target lowering hook.
-    if (!Opt.DisableMergeICmps)
-      addPass(MergeICmpsPass());
-    addPass(ExpandMemCmpPass(&TM));
-  }
 
   // Run GC lowering passes for builtin collectors
   // TODO: add a pass insertion point here
diff --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
index f950dfae7e338b..3c00668aae3897 100644
--- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def
+++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
@@ -47,7 +47,6 @@ FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass, (TM))
 FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass, (false))
 FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass, (TM))
 FUNCTION_PASS("expand-large-fp-convert", ExpandLargeFpConvertPass, (TM))
-FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass, (TM))
 FUNCTION_PASS("expand-reductions", ExpandReductionsPass, ())
 FUNCTION_PASS("expandvp", ExpandVectorPredicationPass, ())
 FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass, (TM))
@@ -55,7 +54,6 @@ FUNCTION_PASS("interleaved-access", InterleavedAccessPass, (TM))
 FUNCTION_PASS("interleaved-load-combine", InterleavedLoadCombinePass, (TM))
 FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass, ())
 FUNCTION_PASS("lowerinvoke", LowerInvokePass, ())
-FUNCTION_PASS("mergeicmps", MergeICmpsPass, ())
 FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass, ())
 FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass, (true))
 FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib, ())
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index ca9fbb1def7624..e5ed5f15f62ed7 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -519,8 +519,6 @@ namespace llvm {
   // Expands large div/rem instructions.
   FunctionPass *createExpandLargeFpConvertPass();
 
-  // This pass expands memcmp() to load/stores.
-  FunctionPass *createExpandMemCmpLegacyPass();
 
   /// Creates Break False Dependencies pass. \see BreakFalseDeps.cpp
   FunctionPass *createBreakFalseDeps();
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 46b1e95c3c15f3..b0ca9fa942cda3 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -103,7 +103,6 @@ void initializeEdgeBundlesPass(PassRegistry&);
 void initializeEHContGuardCatchretPass(PassRegistry &);
 void initializeExpandLargeFpConvertLegacyPassPass(PassRegistry&);
 void initializeExpandLargeDivRemLegacyPassPass(PassRegistry&);
-void initializeExpandMemCmpLegacyPassPass(PassRegistry &);
 void initializeExpandPostRAPass(PassRegistry&);
 void initializeExpandReductionsPass(PassRegistry&);
 void initializeExpandVectorPredicationPass(PassRegistry &);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 7a21876e565a7c..9aff428fbe938b 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -119,7 +119,6 @@ namespace {
       (void) llvm::createPostDomTree();
       (void) llvm::createMergeICmpsLegacyPass();
       (void) llvm::createExpandLargeDivRemPass();
-      (void)llvm::createExpandMemCmpLegacyPass();
       (void) llvm::createExpandVectorPredicationPass();
       std::string buf;
       llvm::raw_string_ostream os(buf);
diff --git a/llvm/include/llvm/CodeGen/ExpandMemCmp.h b/llvm/include/llvm/Transforms/Scalar/ExpandMemCmp.h
similarity index 83%
rename from llvm/include/llvm/CodeGen/ExpandMemCmp.h
rename to llvm/include/llvm/Transforms/Scalar/ExpandMemCmp.h
index 94a877854f327a..94ba0cf9305040 100644
--- a/llvm/include/llvm/CodeGen/ExpandMemCmp.h
+++ b/llvm/include/llvm/Transforms/Scalar/ExpandMemCmp.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_EXPANDMEMCMP_H
-#define LLVM_CODEGEN_EXPANDMEMCMP_H
+#ifndef LLVM_TRANSFORMS_SCALAR_EXPANDMEMCMP_H
+#define LLVM_TRANSFORMS_SCALAR_EXPANDMEMCMP_H
 
 #include "llvm/IR/PassManager.h"
 
@@ -26,4 +26,4 @@ class ExpandMemCmpPass : public PassInfoMixin<ExpandMemCmpPass> {
 
 } // namespace llvm
 
-#endif // LLVM_CODEGEN_EXPANDMEMCMP_H
+#endif // LLVM_TRANSFORMS_SCALAR_EXPANDMEMCMP_H
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index df2d1831ee5fdb..518432e9a7b32f 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -71,7 +71,6 @@ add_llvm_component_library(LLVMCodeGen
   ExecutionDomainFix.cpp
   ExpandLargeDivRem.cpp
   ExpandLargeFpConvert.cpp
-  ExpandMemCmp.cpp
   ExpandPostRAPseudos.cpp
   ExpandReductions.cpp
   ExpandVectorPredication.cpp
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 7b73a7b11ddf1c..043fa4e6eabe8f 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -41,7 +41,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeEarlyTailDuplicatePass(Registry);
   initializeExpandLargeDivRemLegacyPassPass(Registry);
   initializeExpandLargeFpConvertLegacyPassPass(Registry);
-  initializeExpandMemCmpLegacyPassPass(Registry);
   initializeExpandPostRAPass(Registry);
   initializeFEntryInserterPass(Registry);
   initializeFinalizeISelPass(Registry);
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 4003a08a5422dd..33562e90e94426 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -108,9 +108,6 @@ static cl::opt<bool> EnableImplicitNullChecks(
     "enable-implicit-null-checks",
     cl::desc("Fold null checks into faulting memory operations"),
     cl::init(false), cl::Hidden);
-static cl::opt<bool> DisableMergeICmps("disable-mergeicmps",
-    cl::desc("Disable MergeICmps Pass"),
-    cl::init(false), cl::Hidden);
 static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
     cl::desc("Print LLVM IR produced by the loop-reduce pass"));
 static cl::opt<bool>
@@ -487,7 +484,6 @@ CGPassBuilderOption llvm::getCGPassBuilderOption() {
   SET_BOOLEAN_OPTION(EnableImplicitNullChecks)
   SET_BOOLEAN_OPTION(EnableMachineOutliner)
   SET_BOOLEAN_OPTION(MISchedPostRA)
-  SET_BOOLEAN_OPTION(DisableMergeICmps)
   SET_BOOLEAN_OPTION(DisableLSR)
   SET_BOOLEAN_OPTION(DisableConstantHoisting)
   SET_BOOLEAN_OPTION(DisableCGP)
@@ -872,13 +868,6 @@ void TargetPassConfig::addIRPasses() {
                                         "\n\n*** Code after LSR ***\n"));
     }
 
-    // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
-    // loads and compares. ExpandMemCmpPass then tries to expand those calls
-    // into optimally-sized loads and compares. The transforms are enabled by a
-    // target lowering hook.
-    if (!DisableMergeICmps)
-      addPass(createMergeICmpsLegacyPass());
-    addPass(createExpandMemCmpLegacyPass());
   }
 
   // Run GC lowering passes for builtin collectors
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 439f749bda8bb7..20448554756aca 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -76,7 +76,6 @@
 #include "llvm/CodeGen/DwarfEHPrepare.h"
 #include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandLargeFpConvert.h"
-#include "llvm/CodeGen/ExpandMemCmp.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/HardwareLoops.h"
 #include "llvm/CodeGen/IndirectBrExpand.h"
@@ -181,6 +180,7 @@
 #include "llvm/Transforms/Scalar/DeadStoreElimination.h"
 #include "llvm/Transforms/Scalar/DivRemPairs.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Transforms/Scalar/ExpandMemCmp.h"
 #include "llvm/Transforms/Scalar/FlattenCFG.h"
 #include "llvm/Transforms/Scalar/Float2Int.h"
 #include "llvm/Transforms/Scalar/GVN.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 5c6c391049a7b2..e2dd413f12d696 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -86,6 +86,7 @@
 #include "llvm/Transforms/Scalar/DeadStoreElimination.h"
 #include "llvm/Transforms/Scalar/DivRemPairs.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Transforms/Scalar/ExpandMemCmp.h"
 #include "llvm/Transforms/Scalar/Float2Int.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/IndVarSimplify.h"
@@ -111,6 +112,7 @@
 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
+#include "llvm/Transforms/Scalar/MergeICmps.h"
 #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
 #include "llvm/Transforms/Scalar/NewGVN.h"
 #include "llvm/Transforms/Scalar/Reassociate.h"
@@ -386,6 +388,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
   if (AreStatisticsEnabled())
     FPM.addPass(CountVisitsPass());
 
+  FPM.addPass(MergeICmpsPass());
+  FPM.addPass(ExpandMemCmpPass(TM));
   // Form SSA out of local memory accesses after breaking apart aggregates into
   // scalars.
   FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
@@ -532,6 +536,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   if (AreStatisticsEnabled())
     FPM.addPass(CountVisitsPass());
 
+  FPM.addPass(MergeICmpsPass());
+  FPM.addPass(ExpandMemCmpPass(TM));
   // Form SSA out of local memory accesses after breaking apart aggregates into
   // scalars.
   FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 82ce040c649626..31adbf1942b410 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -353,6 +353,7 @@ FUNCTION_PASS("mem2reg", PromotePass())
 FUNCTION_PASS("memcpyopt", MemCpyOptPass())
 FUNCTION_PASS("memprof", MemProfilerPass())
 FUNCTION_PASS("mergeicmps", MergeICmpsPass())
+FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM))
 FUNCTION_PASS("mergereturn", UnifyFunctionExitNodesPass())
 FUNCTION_PASS("move-auto-init", MoveAutoInitPass())
 FUNCTION_PASS("nary-reassociate", NaryReassociatePass())
@@ -415,7 +416,7 @@ FUNCTION_PASS("structurizecfg", StructurizeCFGPass())
 FUNCTION_PASS("tailcallelim", TailCallElimPass())
 FUNCTION_PASS("tlshoist", TLSVariableHoistPass())
 FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass())
-FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())  
+FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
 FUNCTION_PASS("tsan", ThreadSanitizerPass())
 FUNCTION_PASS("typepromotion", TypePromotionPass(TM))
 FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 2dd27037a17de7..f6e666dd071256 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMScalarOpts
   DeadStoreElimination.cpp
   DFAJumpThreading.cpp
   DivRemPairs.cpp
+  ExpandMemCmp.cpp
   EarlyCSE.cpp
   FlattenCFGPass.cpp
   Float2Int.cpp
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
similarity index 90%
rename from llvm/lib/CodeGen/ExpandMemCmp.cpp
rename to llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
index bb84813569f4d5..973875ee142978 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
@@ -11,21 +11,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/ExpandMemCmp.h"
+#include "llvm/Transforms/Scalar/ExpandMemCmp.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LazyBlockFrequencyInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -35,9 +36,6 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
-namespace llvm {
-class TargetLowering;
-}
 
 #define DEBUG_TYPE "expand-memcmp"
 
@@ -305,6 +303,7 @@ unsigned MemCmpExpansion::getNumBlocks() {
 }
 
 void MemCmpExpansion::createLoadCmpBlocks() {
+  assert(ResBlock.BB && "ResBlock must be created before LoadCmpBlocks");
   for (unsigned i = 0; i < getNumBlocks(); i++) {
     BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
                                         EndBlock->getParent(), EndBlock);
@@ -313,6 +312,7 @@ void MemCmpExpansion::createLoadCmpBlocks() {
 }
 
 void MemCmpExpansion::createResultBlock() {
+  assert(EndBlock && "EndBlock must be created before ResultBlock");
   ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
                                    EndBlock->getParent(), EndBlock);
 }
@@ -828,9 +828,9 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
 ///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
 ///  ret i32 %phi.res
 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
-                         const TargetLowering *TLI, const DataLayout *DL,
-                         ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
-                         DomTreeUpdater *DTU, const bool IsBCmp) {
+                         const DataLayout *DL, ProfileSummaryInfo *PSI,
+                         BlockFrequencyInfo *BFI, DomTreeUpdater *DTU,
+                         const bool IsBCmp) {
   NumMemCmpCalls++;
 
   // Early exit from expansion if -Oz.
@@ -845,9 +845,7 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
   }
   const uint64_t SizeVal = SizeCast->getZExtValue();
 
-  if (SizeVal == 0) {
-    return false;
-  }
+
   // TTI call to check if target would like to expand memcmp. Also, get the
   // available load sizes.
   const bool IsUsedForZeroCmp =
@@ -857,28 +855,33 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
   auto Options = TTI->enableMemCmpExpansion(OptForSize,
                                             IsUsedForZeroCmp);
   if (!Options) return false;
+  Value *Res = nullptr;
 
-  if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
-    Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
-
-  if (OptForSize &&
-      MaxLoadsPerMemcmpOptSize.getNumOccurrences())
-    Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
+  if (SizeVal == 0) {
+    Res = ConstantInt::get(CI->getFunctionType()->getReturnType(), 0);
+  } else {
+    if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
+      Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
 
-  if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())
-    Options.MaxNumLoads = MaxLoadsPerMemcmp;
+    if (OptForSize &&
+        MaxLoadsPerMemcmpOptSize.getNumOccurrences())
+      Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
 
-  MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL, DTU);
+    if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())
+      Options.MaxNumLoads = MaxLoadsPerMemcmp;
 
-  // Don't expand if this will require more loads than desired by the target.
-  if (Expansion.getNumLoads() == 0) {
-    NumMemCmpGreaterThanMax++;
-    return false;
-  }
+    MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL, DTU);
 
-  NumMemCmpInlined++;
+    // Don't expand if this will require more loads than desired by the target.
+    if (Expansion.getNumLoads() == 0) {
+      NumMemCmpGreaterThanMax++;
+      return false;
+    }
 
-  if (Value *Res = Expansion.getMemCmpExpansion()) {
+    NumMemCmpInlined++;
+    Res = Expansion.getMemCmpExpansion();
+  }
+  if (Res) {
     // Replace call with result of expansion and erase call.
     CI->replaceAllUsesWith(Res);
     CI->eraseFromParent();
@@ -889,62 +892,18 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
 
 // Returns true if a change was made.
 static bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
-                       const TargetTransformInfo *TTI, const TargetLowering *TL,
+                       const TargetTransformInfo *TTI,
                        const DataLayout &DL, ProfileSummaryInfo *PSI,
                        BlockFrequencyInfo *BFI, DomTreeUpdater *DTU);
 
 static PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
                                  const TargetTransformInfo *TTI,
-                                 const TargetLowering *TL,
                                  ProfileSummaryInfo *PSI,
                                  BlockFrequencyInfo *BFI, DominatorTree *DT);
 
-class ExpandMemCmpLegacyPass : public FunctionPass {
-public:
-  static char ID;
-
-  ExpandMemCmpLegacyPass() : FunctionPass(ID) {
-    initializeExpandMemCmpLegacyPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnFunction(Function &F) override {
-    if (skipFunction(F)) return false;
-
-    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-    if (!TPC) {
-      return false;
-    }
-    const TargetLowering* TL =
-        TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
-
-    const TargetLibraryInfo *TLI =
-        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-    const TargetTransformInfo *TTI =
-        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-    auto *BFI = (PSI && PSI->hasProfileSummary()) ?
-           &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
-           nullptr;
-    DominatorTree *DT = nullptr;
-    if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
-      DT = &DTWP->getDomTree();
-    auto PA = runImpl(F, TLI, TTI, TL, PSI, BFI, DT);
-    return !PA.areAllPreserved();
-  }
-
-private:
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
-    AU.addRequired<TargetTransformInfoWrapperPass>();
-    AU.addRequired<ProfileSummaryInfoWrapperPass>();
-    AU.addPreserved<DominatorTreeWrapperPass>();
-    LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
-    FunctionPass::getAnalysisUsage(AU);
-  }
-};
 
 bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
-                const TargetTransformInfo *TTI, const TargetLowering *TL,
+                const TargetTransformInfo *TTI,
                 const DataLayout &DL, ProfileSummaryInfo *PSI,
                 BlockFrequencyInfo *BFI, DomTreeUpdater *DTU) {
   for (Instruction &I : BB) {
@@ -955,7 +914,7 @@ bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
     LibFunc Func;
     if (TLI->getLibFunc(*CI, Func) &&
         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
-        expandMemCmp(CI, TTI, TL, &DL, PSI, BFI, DTU, Func == LibFunc_bcmp)) {
+        expandMemCmp(CI, TTI, &DL, PSI, BFI, DTU, Func == LibFunc_bcmp)) {
       return true;
     }
   }
@@ -963,8 +922,7 @@ bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
 }
 
 PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
-                          const TargetTransformInfo *TTI,
-                          const TargetLowering *TL, ProfileSummaryInfo *PSI,
+                          const TargetTransformInfo *TTI, ProfileSummaryInfo *PSI,
                           BlockFrequencyInfo *BFI, DominatorTree *DT) {
   std::optional<DomTreeUpdater> DTU;
   if (DT)
@@ -973,7 +931,7 @@ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
   const DataLayout& DL = F.getParent()->getDataLayout();
   bool MadeChanges = false;
   for (auto BBIt = F.begin(); BBIt != F.end();) {
-    if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI, DTU ? &*DTU : nullptr)) {
+    if (runOnBlock(*BBIt, TLI, TTI, DL, PSI, BFI, DTU ? &*DTU : nullptr)) {
       MadeChanges = true;
       // If changes were made, restart the function from the beginning, since
       // the structure of the function was changed.
@@ -996,7 +954,6 @@ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
 
 PreservedAnalyses ExpandMemCmpPass::run(Function &F,
                                         FunctionAnalysisManager &FAM) {
-  const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
   const auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
   const auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
   auto *PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)
@@ -1005,21 +962,5 @@ PreservedAnalyses ExpandMemCmpPass::run(Function &F,
                                 ? &FAM.getResult<BlockFrequencyAnalysis>(F)
                                 : nullptr;
   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
-
-  return runImpl(F, &TLI, &TTI, TL, PSI, BFI, DT);
-}
-
-char ExpandMemCmpLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ExpandMemCmpLegacyPass, DEBUG_TYPE,
-                      "Expand memcmp() to load/stores", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ExpandMemCmpLegacyPass, DEBUG_TYPE,
-                    "Expand memcmp() to load/stores", false, false)
-
-FunctionPass *llvm::createExpandMemCmpLegacyPass() {
-  return new ExpandMemCmpLegacyPass();
+  return runImpl(F, &TLI, &TTI, PSI, BFI, DT);
 }
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 638f26298ee26a..c96c1edebaf8cc 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -43,13 +43,6 @@
 ; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results
-; CHECK-NEXT:       Merge contiguous icmps into a memcmp
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
 ; CHECK-NEXT:       Lower constant intrinsics
diff --git a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
deleted file mode 100644
index 4846c46e648178..00000000000000
--- a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
+++ /dev/null
@@ -1,98 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu                     | FileCheck %s --check-prefix=CHECKN
-; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu -mattr=strict-align | FileCheck %s --check-prefix=CHECKS
-
-declare i32 @bcmp(ptr, ptr, i64) nounwind readonly
-declare i32 @memcmp(ptr, ptr, i64) nounwind readonly
-
-define i1 @test_b2(ptr %s1, ptr %s2) {
-; CHECKN-LABEL: test_b2:
-; CHECKN:       // %bb.0: // %entry
-; CHECKN-NEXT:    ldr x8, [x0]
-; CHECKN-NEXT:    ldr x9, [x1]
-; CHECKN-NEXT:    ldur x10, [x0, #7]
-; CHECKN-NEXT:    ldur x11, [x1, #7]
-; CHECKN-NEXT:    cmp x8, x9
-; CHECKN-NEXT:    ccmp x10, x11, #0, eq
-; CHECKN-NEXT:    cset w0, eq
-; CHECKN-NEXT:    ret
-;
-; CHECKS-LABEL: test_b2:
-; CHECKS:       // %bb.0: // %entry
-; CHECKS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECKS-NEXT:    .cfi_def_cfa_offset 16
-; CHECKS-NEXT:    .cfi_offset w30, -16
-; CHECKS-NEXT:    mov w2, #15 // =0xf
-; CHECKS-NEXT:    bl bcmp
-; CHECKS-NEXT:    cmp w0, #0
-; CHECKS-NEXT:    cset w0, eq
-; CHECKS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECKS-NEXT:    ret
-entry:
-  %bcmp = call i32 @bcmp(ptr %s1, ptr %s2, i64 15)
-  %ret = icmp eq i32 %bcmp, 0
-  ret i1 %ret
-}
-
-; TODO: Four loads should be within the limit, but the heuristic isn't implemented.
-define i1 @test_b2_align8(ptr align 8 %s1, ptr align 8 %s2) {
-; CHECKN-LABEL: test_b2_align8:
-; CHECKN:       // %bb.0: // %entry
-; CHECKN-NEXT:    ldr x8, [x0]
-; CHECKN-NEXT:    ldr x9, [x1]
-; CHECKN-NEXT:    ldur x10, [x0, #7]
-; CHECKN-NEXT:    ldur x11, [x1, #7]
-; CHECKN-NEXT:    cmp x8, x9
-; CHECKN-NEXT:    ccmp x10, x11, #0, eq
-; CHECKN-NEXT:    cset w0, eq
-; CHECKN-NEXT:    ret
-;
-; CHECKS-LABEL: test_b2_align8:
-; CHECKS:       // %bb.0: // %entry
-; CHECKS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECKS-NEXT:    .cfi_def_cfa_offset 16
-; CHECKS-NEXT:    .cfi_offset w30, -16
-; CHECKS-NEXT:    mov w2, #15 // =0xf
-; CHECKS-NEXT:    bl bcmp
-; CHECKS-NEXT:    cmp w0, #0
-; CHECKS-NEXT:    cset w0, eq
-; CHECKS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECKS-NEXT:    ret
-entry:
-  %bcmp = call i32 @bcmp(ptr %s1, ptr %s2, i64 15)
-  %ret = icmp eq i32 %bcmp, 0
-  ret i1 %ret
-}
-
-define i1 @test_bs(ptr %s1, ptr %s2) optsize {
-; CHECKN-LABEL: test_bs:
-; CHECKN:       // %bb.0: // %entry
-; CHECKN-NEXT:    ldp x8, x11, [x1]
-; CHECKN-NEXT:    ldr x12, [x0, #16]
-; CHECKN-NEXT:    ldp x9, x10, [x0]
-; CHECKN-NEXT:    ldr x13, [x1, #16]
-; CHECKN-NEXT:    cmp x9, x8
-; CHECKN-NEXT:    ldur x8, [x0, #23]
-; CHECKN-NEXT:    ldur x9, [x1, #23]
-; CHECKN-NEXT:    ccmp x10, x11, #0, eq
-; CHECKN-NEXT:    ccmp x12, x13, #0, eq
-; CHECKN-NEXT:    ccmp x8, x9, #0, eq
-; CHECKN-NEXT:    cset w0, eq
-; CHECKN-NEXT:    ret
-;
-; CHECKS-LABEL: test_bs:
-; CHECKS:       // %bb.0: // %entry
-; CHECKS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECKS-NEXT:    .cfi_def_cfa_offset 16
-; CHECKS-NEXT:    .cfi_offset w30, -16
-; CHECKS-NEXT:    mov w2, #31 // =0x1f
-; CHECKS-NEXT:    bl memcmp
-; CHECKS-NEXT:    cmp w0, #0
-; CHECKS-NEXT:    cset w0, eq
-; CHECKS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECKS-NEXT:    ret
-entry:
-  %memcmp = call i32 @memcmp(ptr %s1, ptr %s2, i64 31)
-  %ret = icmp eq i32 %memcmp, 0
-  ret i1 %ret
-}
diff --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll
deleted file mode 100644
index fee52ead989629..00000000000000
--- a/llvm/test/CodeGen/AArch64/bcmp.ll
+++ /dev/null
@@ -1,537 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu                     | FileCheck %s
-
-declare i32 @bcmp(ptr, ptr, i64)
-
-define i1 @bcmp0(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp0:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #1 // =0x1
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 0)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp1(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    ldrb w9, [x1]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 1)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp2(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 2)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; or (and (xor a, b), C1), (and (xor c, d), C2)
-define i1 @bcmp3(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp3:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #2]
-; CHECK-NEXT:    ldrb w11, [x1, #2]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 3)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp4(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp4:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 4)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; or (xor a, b), (and (xor c, d), C2)
-define i1 @bcmp5(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp5:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #4]
-; CHECK-NEXT:    ldrb w11, [x1, #4]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 5)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; or (xor a, b), (and (xor c, d), C2)
-define i1 @bcmp6(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp6:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    ldrh w10, [x0, #4]
-; CHECK-NEXT:    ldrh w11, [x1, #4]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 6)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; or (xor a, b), (xor c, d)
-define i1 @bcmp7(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp7:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    ldur w10, [x0, #3]
-; CHECK-NEXT:    ldur w11, [x1, #3]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 7)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp8(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 8)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; or (xor a, b), (and (xor c, d), C2)
-define i1 @bcmp9(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp9:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #8]
-; CHECK-NEXT:    ldrb w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 9)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp10(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp10:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldrh w10, [x0, #8]
-; CHECK-NEXT:    ldrh w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 10)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp11(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp11:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #3]
-; CHECK-NEXT:    ldur x11, [x1, #3]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 11)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp12(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp12:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldr w10, [x0, #8]
-; CHECK-NEXT:    ldr w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 12)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp13(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp13:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #5]
-; CHECK-NEXT:    ldur x11, [x1, #5]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 13)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp14(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp14:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #6]
-; CHECK-NEXT:    ldur x11, [x1, #6]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 14)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp15(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp15:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #7]
-; CHECK-NEXT:    ldur x11, [x1, #7]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 15)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp16(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 16)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp20(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp20:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr w12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr w13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 20)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp24(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp24:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 24)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp28(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp28:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldr w8, [x0, #24]
-; CHECK-NEXT:    ldr w9, [x1, #24]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x9, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 28)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp33(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp33:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ldrb w10, [x0, #32]
-; CHECK-NEXT:    ldrb w11, [x1, #32]
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 33)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp38(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp38:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ldur x10, [x0, #30]
-; CHECK-NEXT:    ldur x11, [x1, #30]
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 38)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp45(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp45:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ldr x10, [x0, #32]
-; CHECK-NEXT:    ldr x11, [x1, #32]
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldur x8, [x0, #37]
-; CHECK-NEXT:    ldur x12, [x1, #37]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 45)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; Although the large cmp chain may be not profitable on high end CPU, we
-; believe it is better on most cpus, so perform the transform now.
-; 8 xor + 7 or + 1 cmp only need 6 cycles on a 4 width ALU port machine
-;   2 cycle for xor
-;   3 cycle for or
-;   1 cycle for cmp
-define i1 @bcmp64(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldp x8, x11, [x0, #32]
-; CHECK-NEXT:    ldp x10, x12, [x1, #32]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ldp x9, x13, [x1, #48]
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    ldp x8, x10, [x0, #48]
-; CHECK-NEXT:    ccmp x11, x12, #0, eq
-; CHECK-NEXT:    ccmp x8, x9, #0, eq
-; CHECK-NEXT:    ccmp x10, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 64)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp89(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp89:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov w2, #89 // =0x59
-; CHECK-NEXT:    bl bcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 89)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp_zext(i32 %0, i32 %1, i8 %2, i8 %3) {
-; CHECK-LABEL: bcmp_zext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0xff
-; CHECK-NEXT:    and w9, w3, #0xff
-; CHECK-NEXT:    cmp w1, w0
-; CHECK-NEXT:    ccmp w9, w8, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %5 = xor i32 %1, %0
-  %6 = xor i8 %3, %2
-  %7 = zext i8 %6 to i32
-  %8 = or i32 %5, %7
-  %9 = icmp eq i32 %8, 0
-  ret i1 %9
-}
-
-define i1 @bcmp_i8(i8 %a0, i8 %b0, i8 %a1, i8 %b1, i8 %a2, i8 %b2) {
-; CHECK-LABEL: bcmp_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, #0xff
-; CHECK-NEXT:    and w9, w2, #0xff
-; CHECK-NEXT:    and w10, w3, #0xff
-; CHECK-NEXT:    cmp w8, w0, uxtb
-; CHECK-NEXT:    and w8, w4, #0xff
-; CHECK-NEXT:    and w11, w5, #0xff
-; CHECK-NEXT:    ccmp w10, w9, #0, eq
-; CHECK-NEXT:    ccmp w11, w8, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %xor0 = xor i8 %b0, %a0
-  %xor1 = xor i8 %b1, %a1
-  %xor2 = xor i8 %b2, %a2
-  %or0 = or i8 %xor0, %xor1
-  %or1 = or i8 %or0, %xor2
-  %r = icmp eq i8 %or1, 0
-  ret i1 %r
-}
-
-define i1 @bcmp_i16(i16 %a0, i16 %b0, i16 %a1, i16 %b1, i16 %a2, i16 %b2) {
-; CHECK-LABEL: bcmp_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, #0xffff
-; CHECK-NEXT:    and w9, w2, #0xffff
-; CHECK-NEXT:    and w10, w3, #0xffff
-; CHECK-NEXT:    cmp w8, w0, uxth
-; CHECK-NEXT:    and w8, w4, #0xffff
-; CHECK-NEXT:    and w11, w5, #0xffff
-; CHECK-NEXT:    ccmp w10, w9, #0, eq
-; CHECK-NEXT:    ccmp w11, w8, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %xor0 = xor i16 %b0, %a0
-  %xor1 = xor i16 %b1, %a1
-  %xor2 = xor i16 %b2, %a2
-  %or0 = or i16 %xor0, %xor1
-  %or1 = or i16 %or0, %xor2
-  %r = icmp eq i16 %or1, 0
-  ret i1 %r
-}
-
-define i1 @bcmp_i128(i128 %a0, i128 %b0, i128 %a1, i128 %b1, i128 %a2, i128 %b2) {
-; CHECK-LABEL: bcmp_i128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp x2, x0
-; CHECK-NEXT:    ldp x8, x10, [sp]
-; CHECK-NEXT:    ccmp x3, x1, #0, eq
-; CHECK-NEXT:    ldp x9, x11, [sp, #16]
-; CHECK-NEXT:    ccmp x6, x4, #0, eq
-; CHECK-NEXT:    ccmp x7, x5, #0, eq
-; CHECK-NEXT:    cset w12, ne
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x11, x10, #0, eq
-; CHECK-NEXT:    csinc w0, w12, wzr, eq
-; CHECK-NEXT:    ret
-  %xor0 = xor i128 %b0, %a0
-  %xor1 = xor i128 %b1, %a1
-  %xor2 = xor i128 %b2, %a2
-  %or0 = or i128 %xor0, %xor1
-  %or1 = or i128 %or0, %xor2
-  %r = icmp ne i128 %or1, 0
-  ret i1 %r
-}
-
-define i1 @bcmp_i42(i42 %a0, i42 %b0, i42 %a1, i42 %b1, i42 %a2, i42 %b2) {
-; CHECK-LABEL: bcmp_i42:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x0, #0x3ffffffffff
-; CHECK-NEXT:    and x9, x1, #0x3ffffffffff
-; CHECK-NEXT:    and x10, x2, #0x3ffffffffff
-; CHECK-NEXT:    and x11, x3, #0x3ffffffffff
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    and x8, x4, #0x3ffffffffff
-; CHECK-NEXT:    and x9, x5, #0x3ffffffffff
-; CHECK-NEXT:    ccmp x11, x10, #0, eq
-; CHECK-NEXT:    ccmp x9, x8, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %xor0 = xor i42 %b0, %a0
-  %xor1 = xor i42 %b1, %a1
-  %xor2 = xor i42 %b2, %a2
-  %or0 = or i42 %xor0, %xor1
-  %or1 = or i42 %or0, %xor2
-  %r = icmp ne i42 %or1, 0
-  ret i1 %r
-}
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
index a48a4e0e723ebc..855a5b23f6c1cc 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define i1 @combine_setcc_eq_vecreduce_or_v8i1(<8 x i8> %a) {
@@ -266,8 +266,18 @@ define i1 @combine_setcc_eq0_conjunction_xor_or(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
-  %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
-  %cmp = icmp eq i32 %bcmp, 0
+  %1 = load i64, ptr %a, align 1
+  %2 = load i64, ptr %b, align 1
+  %3 = xor i64 %1, %2
+  %4 = getelementptr i8, ptr %a, i64 8
+  %5 = getelementptr i8, ptr %b, i64 8
+  %6 = load i64, ptr %4, align 1
+  %7 = load i64, ptr %5, align 1
+  %8 = xor i64 %6, %7
+  %9 = or i64 %3, %8
+  %10 = icmp ne i64 %9, 0
+  %11 = zext i1 %10 to i32
+  %cmp = icmp eq i32 %11, 0
   ret i1 %cmp
 }
 
@@ -280,9 +290,18 @@ define i1 @combine_setcc_ne0_conjunction_xor_or(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
-  %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
-  %cmp = icmp ne i32 %bcmp, 0
-  ret i1 %cmp
+  %1 = load i64, ptr %a, align 1
+  %2 = load i64, ptr %b, align 1
+  %3 = xor i64 %1, %2
+  %4 = getelementptr i8, ptr %a, i64 8
+  %5 = getelementptr i8, ptr %b, i64 8
+  %6 = load i64, ptr %4, align 1
+  %7 = load i64, ptr %5, align 1
+  %8 = xor i64 %6, %7
+  %9 = or i64 %3, %8
+  %10 = icmp ne i64 %9, 0
+  %11 = zext i1 %10 to i32
+  ret i1 %10
 }
 
 ; Doesn't increase the number of instructions, where the LHS has multiple uses
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
index 30123a31cebbe9..fc0bc1b9661163 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -25,20 +25,23 @@ define i64 @one_dimensional(ptr %a, ptr %b, i64 %N) {
 entry:
   br label %for.body
 
-for.body:                                         ; preds = %entry, %for.body
+for.body:                                         ; preds = %for.body, %entry
   %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
   %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
   %0 = load ptr, ptr %arrayidx, align 8
-  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4)
-  %tobool = icmp eq i32 %bcmp, 0
+  %1 = load i32, ptr %0, align 1
+  %2 = load i32, ptr %b, align 1
+  %3 = icmp ne i32 %1, %2
+  %4 = zext i1 %3 to i32
+  %tobool = icmp eq i32 %4, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.05, %add
   %inc = add nuw i64 %i.06, 1
   %exitcond = icmp eq i64 %inc, %N
   br i1 %exitcond, label %for.exit, label %for.body
 
-for.exit:                                 ; preds = %for.body
+for.exit:                                         ; preds = %for.body
   ret i64 %spec.select
 }
 
@@ -79,32 +82,35 @@ define i64 @two_dimensional(ptr %a, ptr %b, i64 %N, i64 %M) {
 entry:
   br label %for.cond1.preheader
 
-for.cond1.preheader:                           ; preds = %entry, %for.cond1.for.exit3_crit_edge
+for.cond1.preheader:                              ; preds = %for.cond1.for.exit3_crit_edge, %entry
   %i.019 = phi i64 [ %inc7, %for.cond1.for.exit3_crit_edge ], [ 0, %entry ]
   %sum.018 = phi i64 [ %spec.select, %for.cond1.for.exit3_crit_edge ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.019
   %0 = load ptr, ptr %arrayidx, align 8
   br label %for.body4
 
-for.body4:                                     ; preds = %for.cond1.preheader, %for.body4
+for.body4:                                        ; preds = %for.body4, %for.cond1.preheader
   %j.016 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ]
   %sum.115 = phi i64 [ %sum.018, %for.cond1.preheader ], [ %spec.select, %for.body4 ]
   %arrayidx5 = getelementptr inbounds ptr, ptr %0, i64 %j.016
   %1 = load ptr, ptr %arrayidx5, align 8
-  %bcmp = tail call i32 @bcmp(ptr %1, ptr %b, i64 4)
-  %tobool = icmp eq i32 %bcmp, 0
+  %2 = load i32, ptr %1, align 1
+  %3 = load i32, ptr %b, align 1
+  %4 = icmp ne i32 %2, %3
+  %5 = zext i1 %4 to i32
+  %tobool = icmp eq i32 %5, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.115, %add
   %inc = add nuw i64 %j.016, 1
   %exitcond = icmp eq i64 %inc, %M
   br i1 %exitcond, label %for.cond1.for.exit3_crit_edge, label %for.body4
 
-for.cond1.for.exit3_crit_edge:         ; preds = %for.body4
+for.cond1.for.exit3_crit_edge:                    ; preds = %for.body4
   %inc7 = add nuw i64 %i.019, 1
   %exitcond22 = icmp eq i64 %inc7, %N
   br i1 %exitcond22, label %for.exit, label %for.cond1.preheader
 
-for.exit:                                 ; preds = %for.cond1.for.exit3_crit_edge
+for.exit:                                         ; preds = %for.cond1.for.exit3_crit_edge
   ret i64 %spec.select
 }
 
@@ -159,44 +165,47 @@ define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
 entry:
   br label %for.cond1.preheader
 
-for.cond1.preheader:                        ; preds = %entry, %for.cond1.for.cond
+for.cond1.preheader:                              ; preds = %for.cond1.for.cond, %entry
   %i.033 = phi i64 [ %inc15, %for.cond1.for.cond ], [ 0, %entry ]
   %sum.032 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.033
   %0 = load ptr, ptr %arrayidx, align 8
   br label %for.cond5.preheader
 
-for.cond5.preheader:                     ; preds = %for.cond5.for.cond, %for.cond1.preheader
+for.cond5.preheader:                              ; preds = %for.cond5.for.cond, %for.cond1.preheader
   %j.029 = phi i64 [ 0, %for.cond1.preheader ], [ %inc12, %for.cond5.for.cond ]
   %sum.128 = phi i64 [ %sum.032, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
   %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.029
   %1 = load ptr, ptr %arrayidx9, align 8
   br label %for.body8
 
-for.body8:                               ; preds = %for.body8, %for.cond5.preheader
+for.body8:                                        ; preds = %for.body8, %for.cond5.preheader
   %k.026 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
   %sum.225 = phi i64 [ %sum.128, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
   %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.026
   %2 = load ptr, ptr %arrayidx10, align 8
-  %bcmp = tail call i32 @bcmp(ptr %2, ptr %b, i64 4)
-  %tobool = icmp eq i32 %bcmp, 0
+  %3 = load i32, ptr %2, align 1
+  %4 = load i32, ptr %b, align 1
+  %5 = icmp ne i32 %3, %4
+  %6 = zext i1 %5 to i32
+  %tobool = icmp eq i32 %6, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.225, %add
   %inc = add nuw i64 %k.026, 1
   %exitcond = icmp eq i64 %inc, %K
   br i1 %exitcond, label %for.cond5.for.cond, label %for.body8
 
-for.cond5.for.cond:   ; preds = %for.body8
+for.cond5.for.cond:                               ; preds = %for.body8
   %inc12 = add nuw i64 %j.029, 1
   %exitcond44 = icmp eq i64 %inc12, %M
   br i1 %exitcond44, label %for.cond1.for.cond, label %for.cond5.preheader
 
-for.cond1.for.cond: ; preds = %for.cond5.for.cond
+for.cond1.for.cond:                               ; preds = %for.cond5.for.cond
   %inc15 = add nuw i64 %i.033, 1
   %exitcond45 = icmp eq i64 %inc15, %N
   br i1 %exitcond45, label %for.exit, label %for.cond1.preheader
 
-for.exit:                                 ; preds = %for.cond1.for.cond
+for.exit:                                         ; preds = %for.cond1.for.cond
   ret i64 %spec.select
 }
 
@@ -254,14 +263,14 @@ define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
 entry:
   br label %for.cond1.preheader
 
-for.cond1.preheader:                        ; preds = %entry, %for.cond1.for.cond
+for.cond1.preheader:                              ; preds = %for.cond1.for.cond, %entry
   %i.035 = phi i64 [ %inc16, %for.cond1.for.cond ], [ 0, %entry ]
   %sum.034 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.035
   %0 = load ptr, ptr %arrayidx, align 8
   br label %for.cond5.preheader
 
-for.cond5.preheader:                     ; preds = %for.cond5.for.cond, %for.cond1.preheader
+for.cond5.preheader:                              ; preds = %for.cond5.for.cond, %for.cond1.preheader
   %j.031 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.cond5.for.cond ]
   %sum.130 = phi i64 [ %sum.034, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
   %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.031
@@ -270,30 +279,33 @@ for.cond5.preheader:                     ; preds = %for.cond5.for.cond, %for.con
   %2 = load ptr, ptr %arrayidx11, align 8
   br label %for.body8
 
-for.body8:                               ; preds = %for.body8, %for.cond5.preheader
+for.body8:                                        ; preds = %for.body8, %for.cond5.preheader
   %k.028 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
   %sum.227 = phi i64 [ %sum.130, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
   %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.028
   %3 = load ptr, ptr %arrayidx10, align 8
-  %bcmp = tail call i32 @bcmp(ptr %3, ptr %2, i64 4)
-  %tobool = icmp eq i32 %bcmp, 0
+  %4 = load i32, ptr %3, align 1
+  %5 = load i32, ptr %2, align 1
+  %6 = icmp ne i32 %4, %5
+  %7 = zext i1 %6 to i32
+  %tobool = icmp eq i32 %7, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.227, %add
   %inc = add nuw i64 %k.028, 1
   %exitcond = icmp eq i64 %inc, %K
   br i1 %exitcond, label %for.cond5.for.cond, label %for.body8
 
-for.cond5.for.cond:   ; preds = %for.body8
+for.cond5.for.cond:                               ; preds = %for.body8
   %inc13 = add nuw i64 %j.031, 1
   %exitcond46 = icmp eq i64 %inc13, %M
   br i1 %exitcond46, label %for.cond1.for.cond, label %for.cond5.preheader
 
-for.cond1.for.cond: ; preds = %for.cond5.for.cond
+for.cond1.for.cond:                               ; preds = %for.cond5.for.cond
   %inc16 = add nuw i64 %i.035, 1
   %exitcond47 = icmp eq i64 %inc16, %N
   br i1 %exitcond47, label %for.exit, label %for.cond1.preheader
 
-for.exit:                                 ; preds = %for.cond1.for.cond
+for.exit:                                         ; preds = %for.cond1.for.cond
   ret i64 %spec.select
 }
 
@@ -328,19 +340,27 @@ for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
+for.body:                                         ; preds = %for.body, %for.body.preheader
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
   %0 = load ptr, ptr %arrayidx, align 8
-  %call = tail call i32 @memcmp(ptr %0, ptr %b, i64 4)
-  %conv = trunc i32 %call to i8
+  %1 = load i32, ptr %0, align 1
+  %2 = load i32, ptr %b, align 1
+  %3 = call i32 @llvm.bswap.i32(i32 %1)
+  %4 = call i32 @llvm.bswap.i32(i32 %2)
+  %5 = icmp ugt i32 %3, %4
+  %6 = icmp ult i32 %3, %4
+  %7 = zext i1 %5 to i32
+  %8 = zext i1 %6 to i32
+  %9 = sub i32 %7, %8
+  %conv = trunc i32 %9 to i8
   %arrayidx2 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
   store i8 %conv, ptr %arrayidx2, align 1
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.exit, label %for.body
 
-for.exit:                                 ; preds = %for.body
+for.exit:                                         ; preds = %for.body
   ret void
 }
 
@@ -385,13 +405,16 @@ for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
+for.body:                                         ; preds = %for.body, %for.body.preheader
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
   %sum.05 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
   %0 = load ptr, ptr %arrayidx, align 8
-  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4)
-  %tobool.not = icmp eq i32 %bcmp, 0
+  %1 = load i32, ptr %0, align 1
+  %2 = load i32, ptr %b, align 1
+  %3 = icmp ne i32 %1, %2
+  %4 = zext i1 %3 to i32
+  %tobool.not = icmp eq i32 %4, 0
   %add = zext i1 %tobool.not to i32
   %spec.select = add nuw nsw i32 %sum.05, %add
   tail call void @func()
@@ -399,7 +422,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.exit, label %for.body
 
-for.exit:                                 ; preds = %for.body
+for.exit:                                         ; preds = %for.body
   ret i32 %spec.select
 }
 
@@ -431,20 +454,32 @@ define i64 @one_dimensional_two_loads(ptr %a, ptr %b, i64 %N) {
 entry:
   br label %for.body
 
-for.body:                                         ; preds = %entry, %for.body
+for.body:                                         ; preds = %for.body, %entry
   %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
   %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
   %0 = load ptr, ptr %arrayidx, align 8
-  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 6)
-  %tobool = icmp eq i32 %bcmp, 0
+  %1 = load i32, ptr %0, align 1
+  %2 = load i32, ptr %b, align 1
+  %3 = xor i32 %1, %2
+  %4 = getelementptr i8, ptr %0, i64 4
+  %5 = getelementptr i8, ptr %b, i64 4
+  %6 = load i16, ptr %4, align 1
+  %7 = load i16, ptr %5, align 1
+  %8 = zext i16 %6 to i32
+  %9 = zext i16 %7 to i32
+  %10 = xor i32 %8, %9
+  %11 = or i32 %3, %10
+  %12 = icmp ne i32 %11, 0
+  %13 = zext i1 %12 to i32
+  %tobool = icmp eq i32 %13, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.05, %add
   %inc = add nuw i64 %i.06, 1
   %exitcond = icmp eq i64 %inc, %N
   br i1 %exitcond, label %for.exit, label %for.body
 
-for.exit:                                 ; preds = %for.body
+for.exit:                                         ; preds = %for.body
   ret i64 %spec.select
 }
 
@@ -475,18 +510,18 @@ define i64 @hoisting_no_cse(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; CHECK-NEXT:    mov x0, x8
 ; CHECK-NEXT:    ret
 entry:
-  %b.val = load i64, ptr %b
+  %b.val = load i64, ptr %b, align 8
   %b.val.changed = add i64 %b.val, 1
-  store i64 %b.val.changed, ptr %c
+  store i64 %b.val.changed, ptr %c, align 8
   br label %for.body
 
-for.body:                                         ; preds = %entry, %for.body
+for.body:                                         ; preds = %for.body, %entry
   %idx = phi i64 [ %inc, %for.body ], [ 0, %entry ]
   %sum = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %idx
   %0 = load ptr, ptr %arrayidx, align 8
-  %x = load i64, ptr %0
-  %y = load i64, ptr %b
+  %x = load i64, ptr %0, align 8
+  %y = load i64, ptr %b, align 8
   %cmp = icmp eq i64 %x, %y
   %add = zext i1 %cmp to i64
   %spec.select = add i64 %sum, %add
@@ -494,10 +529,15 @@ for.body:                                         ; preds = %entry, %for.body
   %exitcond = icmp eq i64 %inc, %N
   br i1 %exitcond, label %for.exit, label %for.body
 
-for.exit:                                 ; preds = %for.body
+for.exit:                                         ; preds = %for.body
   ret i64 %spec.select
 }
 
 declare i32 @bcmp(ptr, ptr, i64)
 declare i32 @memcmp(ptr, ptr, i64)
 declare void @func()
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.bswap.i32(i32) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll
deleted file mode 100644
index 4da7c8c95a4e4f..00000000000000
--- a/llvm/test/CodeGen/AArch64/memcmp.ll
+++ /dev/null
@@ -1,3029 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
-
- at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-
-define i32 @length0(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length0:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-   ret i32 %m
- }
-
-define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length0_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #1 // =0x1
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length0_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length2(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    sub w0, w8, w9, lsr #16
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i32 @length2_const(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    mov w8, #-12594 // =0xffffcece
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    add w0, w8, w9, lsr #16
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_gt_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    mov w8, #-12594 // =0xffffcece
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    add w8, w8, w9, lsr #16
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    sub w8, w8, w9, lsr #16
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    sub w8, w8, w9, lsr #16
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length2_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    mov w9, #12849 // =0x3231
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_eq_nobuiltin_attr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #2 // =0x2
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length3:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #2]
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    ldrb w10, [x1, #2]
-; CHECK-NEXT:    ldrh w11, [x1]
-; CHECK-NEXT:    orr w8, w9, w8, lsl #16
-; CHECK-NEXT:    orr w9, w11, w10, lsl #16
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length3_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #2]
-; CHECK-NEXT:    ldrb w11, [x1, #2]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length4:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length4_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length4_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4_lt_32(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length4_lt_32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = lshr i32 %m, 31
-  ret i32 %c
-}
-
-define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length4_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, hi
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length4_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    mov w9, #12849 // =0x3231
-; CHECK-NEXT:    movk w9, #13363, lsl #16
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length5:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #4]
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrb w10, [x1, #4]
-; CHECK-NEXT:    ldr w11, [x1]
-; CHECK-NEXT:    orr x8, x9, x8, lsl #32
-; CHECK-NEXT:    orr x9, x11, x10, lsl #32
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length5_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #4]
-; CHECK-NEXT:    ldrb w11, [x1, #4]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length5_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #4]
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrb w10, [x1, #4]
-; CHECK-NEXT:    ldr w11, [x1]
-; CHECK-NEXT:    orr x8, x9, x8, lsl #32
-; CHECK-NEXT:    orr x9, x11, x10, lsl #32
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length6(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length6:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0, #4]
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrh w10, [x1, #4]
-; CHECK-NEXT:    ldr w11, [x1]
-; CHECK-NEXT:    orr x8, x9, x8, lsl #32
-; CHECK-NEXT:    orr x9, x11, x10, lsl #32
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind
-  ret i32 %m
-}
-
-define i32 @length6_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length6_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0, #4]
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrh w10, [x1, #4]
-; CHECK-NEXT:    ldr w11, [x1]
-; CHECK-NEXT:    orr x8, x9, x8, lsl #32
-; CHECK-NEXT:    orr x9, x11, x10, lsl #32
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind
-  %r = lshr i32 %m, 31
-  ret i32 %r
-}
-
-define i32 @length7(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length7:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB24_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldur w8, [x0, #3]
-; CHECK-NEXT:    ldur w9, [x1, #3]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB24_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB24_3: // %res_block
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  ret i32 %m
-}
-
-define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length7_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB25_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldur w8, [x0, #3]
-; CHECK-NEXT:    ldur w9, [x1, #3]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB25_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB25_3: // %res_block
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length7_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    ldur w10, [x0, #3]
-; CHECK-NEXT:    ldur w11, [x1, #3]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length8_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length8_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #12592 // =0x3130
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    movk x9, #13106, lsl #16
-; CHECK-NEXT:    movk x9, #13620, lsl #32
-; CHECK-NEXT:    movk x9, #14134, lsl #48
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length9(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length9:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB30_2
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldrb w8, [x0, #8]
-; CHECK-NEXT:    ldrb w9, [x1, #8]
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB30_2: // %res_block
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
-  ret i32 %m
-}
-
-define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length9_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #8]
-; CHECK-NEXT:    ldrb w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length10(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length10:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB32_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldrh w8, [x0, #8]
-; CHECK-NEXT:    ldrh w9, [x1, #8]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsr w9, w9, #16
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB32_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB32_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
-  ret i32 %m
-}
-
-define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length10_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldrh w10, [x0, #8]
-; CHECK-NEXT:    ldrh w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length11(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length11:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB34_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldur x8, [x0, #3]
-; CHECK-NEXT:    ldur x9, [x1, #3]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB34_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB34_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
-  ret i32 %m
-}
-
-define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length11_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #3]
-; CHECK-NEXT:    ldur x11, [x1, #3]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length12_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldr w10, [x0, #8]
-; CHECK-NEXT:    ldr w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length12:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB37_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr w8, [x0, #8]
-; CHECK-NEXT:    ldr w9, [x1, #8]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB37_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB37_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length13_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #5]
-; CHECK-NEXT:    ldur x11, [x1, #5]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length14_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #6]
-; CHECK-NEXT:    ldur x11, [x1, #6]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length15:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB40_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldur x8, [x0, #7]
-; CHECK-NEXT:    ldur x9, [x1, #7]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB40_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB40_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length15_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB41_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldur x8, [x0, #7]
-; CHECK-NEXT:    ldur x9, [x1, #7]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB41_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB41_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15_const(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length15_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #14136 // =0x3738
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    movk x8, #13622, lsl #16
-; CHECK-NEXT:    movk x8, #13108, lsl #32
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    movk x8, #12594, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB42_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    mov x8, #13365 // =0x3435
-; CHECK-NEXT:    ldur x9, [x0, #7]
-; CHECK-NEXT:    movk x8, #12851, lsl #16
-; CHECK-NEXT:    movk x8, #12337, lsl #32
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    movk x8, #14393, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB42_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB42_3: // %res_block
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length15_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #7]
-; CHECK-NEXT:    ldur x11, [x1, #7]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length15_gt_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #14136 // =0x3738
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    movk x8, #13622, lsl #16
-; CHECK-NEXT:    movk x8, #13108, lsl #32
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    movk x8, #12594, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB44_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    mov x8, #13365 // =0x3435
-; CHECK-NEXT:    ldur x9, [x0, #7]
-; CHECK-NEXT:    movk x8, #12851, lsl #16
-; CHECK-NEXT:    movk x8, #12337, lsl #32
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    movk x8, #14393, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB44_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB44_4
-; CHECK-NEXT:  .LBB44_3: // %res_block
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB44_4: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-
-define i32 @length16(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB45_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB45_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB45_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length16_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length16_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB47_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB47_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB47_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length16_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB48_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB48_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB48_4
-; CHECK-NEXT:  .LBB48_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB48_4: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length16_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x8, #14648 // =0x3938
-; CHECK-NEXT:    movk x8, #12592, lsl #16
-; CHECK-NEXT:    movk x8, #13106, lsl #32
-; CHECK-NEXT:    movk x8, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x8, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-
-define i32 @length24(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length24:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
-; CHECK-NEXT:  // %bb.3:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB50_4: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length24_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length24_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
-; CHECK-NEXT:  // %bb.3:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB52_4: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length24_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
-; CHECK-NEXT:  // %bb.3:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB53_5
-; CHECK-NEXT:  .LBB53_4: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB53_5: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length24_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    ldr x11, [x0, #16]
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x8, #14648 // =0x3938
-; CHECK-NEXT:    movk x8, #12592, lsl #16
-; CHECK-NEXT:    movk x8, #13106, lsl #32
-; CHECK-NEXT:    movk x8, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x8, #0, eq
-; CHECK-NEXT:    mov x8, #14134 // =0x3736
-; CHECK-NEXT:    movk x8, #14648, lsl #16
-; CHECK-NEXT:    movk x8, #12592, lsl #32
-; CHECK-NEXT:    movk x8, #13106, lsl #48
-; CHECK-NEXT:    ccmp x11, x8, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length31(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length31:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldur x8, [x0, #23]
-; CHECK-NEXT:    ldur x9, [x1, #23]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB55_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
-  ret i32 %m
-}
-
-define i1 @length31_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length31_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldur x8, [x0, #23]
-; CHECK-NEXT:    ldur x9, [x1, #23]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x9, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length31_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldur x8, [x0, #23]
-; CHECK-NEXT:    ldur x9, [x1, #23]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB57_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length31_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldur x8, [x0, #23]
-; CHECK-NEXT:    ldur x9, [x1, #23]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB58_6
-; CHECK-NEXT:  .LBB58_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB58_6: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; CHECK-LABEL: length31_eq_prefer128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldur x8, [x0, #23]
-; CHECK-NEXT:    ldur x9, [x1, #23]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x9, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length31_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    ldr x11, [x0, #16]
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x8, #14648 // =0x3938
-; CHECK-NEXT:    ldur x9, [x0, #23]
-; CHECK-NEXT:    movk x8, #12592, lsl #16
-; CHECK-NEXT:    movk x8, #13106, lsl #32
-; CHECK-NEXT:    movk x8, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x8, #0, eq
-; CHECK-NEXT:    mov x8, #14134 // =0x3736
-; CHECK-NEXT:    movk x8, #14648, lsl #16
-; CHECK-NEXT:    movk x8, #12592, lsl #32
-; CHECK-NEXT:    movk x8, #13106, lsl #48
-; CHECK-NEXT:    ccmp x11, x8, #0, eq
-; CHECK-NEXT:    mov x8, #13363 // =0x3433
-; CHECK-NEXT:    movk x8, #13877, lsl #16
-; CHECK-NEXT:    movk x8, #14391, lsl #32
-; CHECK-NEXT:    movk x8, #12345, lsl #48
-; CHECK-NEXT:    ccmp x9, x8, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB61_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length32_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length32_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB63_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length32_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB64_6
-; CHECK-NEXT:  .LBB64_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB64_6: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; CHECK-LABEL: length32_eq_prefer128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length32_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x8, #14648 // =0x3938
-; CHECK-NEXT:    movk x8, #12592, lsl #16
-; CHECK-NEXT:    ldp x9, x11, [x0, #16]
-; CHECK-NEXT:    movk x8, #13106, lsl #32
-; CHECK-NEXT:    movk x8, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x8, #0, eq
-; CHECK-NEXT:    mov x8, #14134 // =0x3736
-; CHECK-NEXT:    movk x8, #14648, lsl #16
-; CHECK-NEXT:    movk x8, #12592, lsl #32
-; CHECK-NEXT:    movk x8, #13106, lsl #48
-; CHECK-NEXT:    ccmp x9, x8, #0, eq
-; CHECK-NEXT:    mov x8, #13620 // =0x3534
-; CHECK-NEXT:    movk x8, #14134, lsl #16
-; CHECK-NEXT:    movk x8, #14648, lsl #32
-; CHECK-NEXT:    movk x8, #12592, lsl #48
-; CHECK-NEXT:    ccmp x11, x8, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length48(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length48:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.6:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB67_7: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
-  ret i32 %m
-}
-
-define i1 @length48_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length48_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldp x8, x11, [x0, #32]
-; CHECK-NEXT:    ldp x10, x12, [x1, #32]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    ccmp x11, x12, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length48_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.6:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB69_7: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length48_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.6:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB70_8
-; CHECK-NEXT:  .LBB70_7: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB70_8: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; CHECK-LABEL: length48_eq_prefer128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldp x8, x11, [x0, #32]
-; CHECK-NEXT:    ldp x10, x12, [x1, #32]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    ccmp x11, x12, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length48_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    ldp x11, x12, [x0, #16]
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x9, #14648 // =0x3938
-; CHECK-NEXT:    movk x9, #12592, lsl #16
-; CHECK-NEXT:    movk x9, #13106, lsl #32
-; CHECK-NEXT:    movk x9, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x9, #0, eq
-; CHECK-NEXT:    mov x9, #14134 // =0x3736
-; CHECK-NEXT:    movk x9, #14648, lsl #16
-; CHECK-NEXT:    movk x9, #12592, lsl #32
-; CHECK-NEXT:    movk x9, #13106, lsl #48
-; CHECK-NEXT:    ccmp x11, x9, #0, eq
-; CHECK-NEXT:    mov x9, #13620 // =0x3534
-; CHECK-NEXT:    movk x9, #14134, lsl #16
-; CHECK-NEXT:    ldp x10, x11, [x0, #32]
-; CHECK-NEXT:    movk x9, #14648, lsl #32
-; CHECK-NEXT:    movk x9, #12592, lsl #48
-; CHECK-NEXT:    ccmp x12, x9, #0, eq
-; CHECK-NEXT:    mov x9, #13106 // =0x3332
-; CHECK-NEXT:    movk x9, #13620, lsl #16
-; CHECK-NEXT:    movk x9, #14134, lsl #32
-; CHECK-NEXT:    movk x9, #14648, lsl #48
-; CHECK-NEXT:    ccmp x10, x9, #0, eq
-; CHECK-NEXT:    ccmp x11, x8, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length63(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length63:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldur x8, [x0, #55]
-; CHECK-NEXT:    ldur x9, [x1, #55]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB73_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
-  ret i32 %m
-}
-
-define i1 @length63_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length63_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldp x8, x11, [x0, #32]
-; CHECK-NEXT:    ldp x10, x12, [x1, #32]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ldr x9, [x0, #48]
-; CHECK-NEXT:    ldr x13, [x1, #48]
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    ldur x8, [x0, #55]
-; CHECK-NEXT:    ldur x10, [x1, #55]
-; CHECK-NEXT:    ccmp x11, x12, #0, eq
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length63_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldur x8, [x0, #55]
-; CHECK-NEXT:    ldur x9, [x1, #55]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB75_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length63_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldur x8, [x0, #55]
-; CHECK-NEXT:    ldur x9, [x1, #55]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB76_10
-; CHECK-NEXT:  .LBB76_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB76_10: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length63_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    ldp x11, x12, [x0, #16]
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x9, #14648 // =0x3938
-; CHECK-NEXT:    movk x9, #12592, lsl #16
-; CHECK-NEXT:    movk x9, #13106, lsl #32
-; CHECK-NEXT:    movk x9, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x9, #0, eq
-; CHECK-NEXT:    mov x10, #14134 // =0x3736
-; CHECK-NEXT:    movk x10, #14648, lsl #16
-; CHECK-NEXT:    movk x10, #12592, lsl #32
-; CHECK-NEXT:    movk x10, #13106, lsl #48
-; CHECK-NEXT:    ccmp x11, x10, #0, eq
-; CHECK-NEXT:    mov x10, #13620 // =0x3534
-; CHECK-NEXT:    movk x10, #14134, lsl #16
-; CHECK-NEXT:    ldp x11, x13, [x0, #32]
-; CHECK-NEXT:    movk x10, #14648, lsl #32
-; CHECK-NEXT:    movk x10, #12592, lsl #48
-; CHECK-NEXT:    ccmp x12, x10, #0, eq
-; CHECK-NEXT:    mov x10, #13106 // =0x3332
-; CHECK-NEXT:    ldr x12, [x0, #48]
-; CHECK-NEXT:    movk x10, #13620, lsl #16
-; CHECK-NEXT:    movk x10, #14134, lsl #32
-; CHECK-NEXT:    movk x10, #14648, lsl #48
-; CHECK-NEXT:    ccmp x11, x10, #0, eq
-; CHECK-NEXT:    ldur x10, [x0, #55]
-; CHECK-NEXT:    ccmp x13, x8, #0, eq
-; CHECK-NEXT:    mov x8, #13877 // =0x3635
-; CHECK-NEXT:    movk x8, #14391, lsl #16
-; CHECK-NEXT:    ccmp x12, x9, #0, eq
-; CHECK-NEXT:    movk x8, #12345, lsl #32
-; CHECK-NEXT:    movk x8, #12849, lsl #48
-; CHECK-NEXT:    ccmp x10, x8, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldr x8, [x0, #56]
-; CHECK-NEXT:    ldr x9, [x1, #56]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB78_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length64_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldp x8, x11, [x0, #32]
-; CHECK-NEXT:    ldp x10, x12, [x1, #32]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ldp x9, x13, [x1, #48]
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    ldp x8, x10, [x0, #48]
-; CHECK-NEXT:    ccmp x11, x12, #0, eq
-; CHECK-NEXT:    ccmp x8, x9, #0, eq
-; CHECK-NEXT:    ccmp x10, x13, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length64_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldr x8, [x0, #56]
-; CHECK-NEXT:    ldr x9, [x1, #56]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB80_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length64_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldr x8, [x0, #56]
-; CHECK-NEXT:    ldr x9, [x1, #56]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB81_10
-; CHECK-NEXT:  .LBB81_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB81_10: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length64_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    ldp x11, x12, [x0, #16]
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    ldp x13, x14, [x0, #32]
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x9, #14648 // =0x3938
-; CHECK-NEXT:    movk x9, #12592, lsl #16
-; CHECK-NEXT:    movk x9, #13106, lsl #32
-; CHECK-NEXT:    movk x9, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x9, #0, eq
-; CHECK-NEXT:    mov x10, #14134 // =0x3736
-; CHECK-NEXT:    movk x10, #14648, lsl #16
-; CHECK-NEXT:    movk x10, #12592, lsl #32
-; CHECK-NEXT:    movk x10, #13106, lsl #48
-; CHECK-NEXT:    ccmp x11, x10, #0, eq
-; CHECK-NEXT:    mov x11, #13620 // =0x3534
-; CHECK-NEXT:    movk x11, #14134, lsl #16
-; CHECK-NEXT:    movk x11, #14648, lsl #32
-; CHECK-NEXT:    movk x11, #12592, lsl #48
-; CHECK-NEXT:    ccmp x12, x11, #0, eq
-; CHECK-NEXT:    mov x11, #13106 // =0x3332
-; CHECK-NEXT:    movk x11, #13620, lsl #16
-; CHECK-NEXT:    movk x11, #14134, lsl #32
-; CHECK-NEXT:    movk x11, #14648, lsl #48
-; CHECK-NEXT:    ccmp x13, x11, #0, eq
-; CHECK-NEXT:    ldp x11, x12, [x0, #48]
-; CHECK-NEXT:    ccmp x14, x8, #0, eq
-; CHECK-NEXT:    ccmp x11, x9, #0, eq
-; CHECK-NEXT:    ccmp x12, x10, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length96(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length96:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #96 // =0x60
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
-  ret i32 %m
-}
-
-define i1 @length96_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length96_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #96 // =0x60
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length96_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #96 // =0x60
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length96_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #96 // =0x60
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length96_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #96 // =0x60
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length127(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length127:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #127 // =0x7f
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
-  ret i32 %m
-}
-
-define i1 @length127_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length127_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #127 // =0x7f
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length127_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #127 // =0x7f
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length127_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #127 // =0x7f
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length127_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #127 // =0x7f
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length128(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #128 // =0x80
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
-  ret i32 %m
-}
-
-define i1 @length128_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length128_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #128 // =0x80
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length128_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #128 // =0x80
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length128_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #128 // =0x80
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length128_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #128 // =0x80
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length192(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length192:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #192 // =0xc0
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
-  ret i32 %m
-}
-
-define i1 @length192_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length192_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #192 // =0xc0
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length192_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #192 // =0xc0
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length192_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #192 // =0xc0
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length192_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #192 // =0xc0
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length255(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length255:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #255 // =0xff
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
-  ret i32 %m
-}
-
-define i1 @length255_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length255_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #255 // =0xff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length255_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #255 // =0xff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length255_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #255 // =0xff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length255_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #255 // =0xff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length256(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length256:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #256 // =0x100
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
-  ret i32 %m
-}
-
-define i1 @length256_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length256_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #256 // =0x100
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length256_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #256 // =0x100
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length256_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #256 // =0x100
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length256_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #256 // =0x100
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length384(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length384:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #384 // =0x180
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
-  ret i32 %m
-}
-
-define i1 @length384_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length384_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #384 // =0x180
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length384_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #384 // =0x180
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length384_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #384 // =0x180
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length384_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #384 // =0x180
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length511(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length511:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #511 // =0x1ff
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
-  ret i32 %m
-}
-
-define i1 @length511_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length511_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #511 // =0x1ff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length511_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #511 // =0x1ff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length511_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #511 // =0x1ff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length511_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #511 // =0x1ff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length512(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length512:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #512 // =0x200
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
-  ret i32 %m
-}
-
-define i1 @length512_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length512_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #512 // =0x200
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length512_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #512 // =0x200
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length512_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #512 // =0x200
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length512_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #512 // =0x200
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @huge_length(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: huge_length:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x2, #9223372036854775807 // =0x7fffffffffffffff
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  ret i32 %m
-}
-
-define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: huge_length_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov x2, #9223372036854775807 // =0x7fffffffffffffff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
-; CHECK-LABEL: nonconst_length:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  ret i32 %m
-}
-
-define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
-; CHECK-LABEL: nonconst_length_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 8b0b6263832243..84210ec410d29f 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -203,13 +203,6 @@
 ; GCN-O1-NEXT:        Canonicalize Freeze Instructions in Loops
 ; GCN-O1-NEXT:        Induction Variable Users
 ; GCN-O1-NEXT:        Loop Strength Reduction
-; GCN-O1-NEXT:      Basic Alias Analysis (stateless AA impl)
-; GCN-O1-NEXT:      Function Alias Analysis Results
-; GCN-O1-NEXT:      Merge contiguous icmps into a memcmp
-; GCN-O1-NEXT:      Natural Loop Information
-; GCN-O1-NEXT:      Lazy Branch Probability Analysis
-; GCN-O1-NEXT:      Lazy Block Frequency Analysis
-; GCN-O1-NEXT:      Expand memcmp() to load/stores
 ; GCN-O1-NEXT:      Lower constant intrinsics
 ; GCN-O1-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O1-NEXT:      Natural Loop Information
@@ -484,13 +477,6 @@
 ; GCN-O1-OPTS-NEXT:        Canonicalize Freeze Instructions in Loops
 ; GCN-O1-OPTS-NEXT:        Induction Variable Users
 ; GCN-O1-OPTS-NEXT:        Loop Strength Reduction
-; GCN-O1-OPTS-NEXT:      Basic Alias Analysis (stateless AA impl)
-; GCN-O1-OPTS-NEXT:      Function Alias Analysis Results
-; GCN-O1-OPTS-NEXT:      Merge contiguous icmps into a memcmp
-; GCN-O1-OPTS-NEXT:      Natural Loop Information
-; GCN-O1-OPTS-NEXT:      Lazy Branch Probability Analysis
-; GCN-O1-OPTS-NEXT:      Lazy Block Frequency Analysis
-; GCN-O1-OPTS-NEXT:      Expand memcmp() to load/stores
 ; GCN-O1-OPTS-NEXT:      Lower constant intrinsics
 ; GCN-O1-OPTS-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O1-OPTS-NEXT:      Natural Loop Information
@@ -784,13 +770,6 @@
 ; GCN-O2-NEXT:        Canonicalize Freeze Instructions in Loops
 ; GCN-O2-NEXT:        Induction Variable Users
 ; GCN-O2-NEXT:        Loop Strength Reduction
-; GCN-O2-NEXT:      Basic Alias Analysis (stateless AA impl)
-; GCN-O2-NEXT:      Function Alias Analysis Results
-; GCN-O2-NEXT:      Merge contiguous icmps into a memcmp
-; GCN-O2-NEXT:      Natural Loop Information
-; GCN-O2-NEXT:      Lazy Branch Probability Analysis
-; GCN-O2-NEXT:      Lazy Block Frequency Analysis
-; GCN-O2-NEXT:      Expand memcmp() to load/stores
 ; GCN-O2-NEXT:      Lower constant intrinsics
 ; GCN-O2-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O2-NEXT:      Natural Loop Information
@@ -1092,13 +1071,6 @@
 ; GCN-O3-NEXT:        Canonicalize Freeze Instructions in Loops
 ; GCN-O3-NEXT:        Induction Variable Users
 ; GCN-O3-NEXT:        Loop Strength Reduction
-; GCN-O3-NEXT:      Basic Alias Analysis (stateless AA impl)
-; GCN-O3-NEXT:      Function Alias Analysis Results
-; GCN-O3-NEXT:      Merge contiguous icmps into a memcmp
-; GCN-O3-NEXT:      Natural Loop Information
-; GCN-O3-NEXT:      Lazy Branch Probability Analysis
-; GCN-O3-NEXT:      Lazy Block Frequency Analysis
-; GCN-O3-NEXT:      Expand memcmp() to load/stores
 ; GCN-O3-NEXT:      Lower constant intrinsics
 ; GCN-O3-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O3-NEXT:      Natural Loop Information
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 5e565970fc3a86..f2bef2c7e46acc 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -21,13 +21,6 @@
 ; CHECK-NEXT:        Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:        Induction Variable Users
 ; CHECK-NEXT:        Loop Strength Reduction
-; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:      Function Alias Analysis Results
-; CHECK-NEXT:      Merge contiguous icmps into a memcmp
-; CHECK-NEXT:      Natural Loop Information
-; CHECK-NEXT:      Lazy Branch Probability Analysis
-; CHECK-NEXT:      Lazy Block Frequency Analysis
-; CHECK-NEXT:      Expand memcmp() to load/stores
 ; CHECK-NEXT:      Lower Garbage Collection Instructions
 ; CHECK-NEXT:      Shadow Stack GC Lowering
 ; CHECK-NEXT:      Lower constant intrinsics
diff --git a/llvm/test/CodeGen/BPF/memcmp.ll b/llvm/test/CodeGen/BPF/memcmp.ll
deleted file mode 100644
index 7ed8dc1e736f4c..00000000000000
--- a/llvm/test/CodeGen/BPF/memcmp.ll
+++ /dev/null
@@ -1,77 +0,0 @@
-; RUN: llc -march=bpfel < %s | FileCheck %s
-; RUN: llc -march=bpfel -mcpu=v3 < %s | FileCheck %s
-;
-; Source code:
-;   /* set aligned 4 to minimize the number of loads */
-;   struct build_id {
-;     unsigned char id[20];
-;   } __attribute__((aligned(4)));
-;
-;   /* try to compute a local build_id */
-;   void bar1(ptr);
-;
-;   /* the global build_id to compare */
-;   struct build_id id2;
-;
-;   int foo()
-;   {
-;     struct build_id id1;
-;
-;     bar1(&id1);
-;     return __builtin_memcmp(&id1, &id2, sizeof(id1)) == 0;
-;   }
-; Compilation flags:
-;   clang -target bpf -S -O2 t.c -emit-llvm
-
-
-%struct.build_id = type { [20 x i8] }
-
- at id2 = dso_local global %struct.build_id zeroinitializer, align 4
-
-; Function Attrs: nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 {
-entry:
-  %id11 = alloca [20 x i8], align 4
-  call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %id11) #4
-  call void @bar1(ptr noundef nonnull %id11) #4
-  %call = call i32 @memcmp(ptr noundef nonnull dereferenceable(20) %id11, ptr noundef nonnull dereferenceable(20) @id2, i64 noundef 20) #4
-  %cmp = icmp eq i32 %call, 0
-  %conv = zext i1 %cmp to i32
-  call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %id11) #4
-  ret i32 %conv
-}
-
-; CHECK-DAG:   *(u32 *)(r1 + 0)
-; CHECK-DAG:   *(u32 *)(r1 + 4)
-; CHECK-DAG:   *(u32 *)(r10 - 16)
-; CHECK-DAG:   *(u32 *)(r10 - 20)
-; CHECK-DAG:   *(u32 *)(r10 - 8)
-; CHECK-DAG:   *(u32 *)(r10 - 12)
-; CHECK-DAG:   *(u32 *)(r1 + 8)
-; CHECK-DAG:   *(u32 *)(r1 + 12)
-; CHECK-DAG:   *(u32 *)(r2 + 16)
-; CHECK-DAG:   *(u32 *)(r10 - 4)
-
-; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
-
-declare dso_local void @bar1(ptr noundef) local_unnamed_addr #2
-
-; Function Attrs: argmemonly mustprogress nofree nounwind readonly willreturn
-declare dso_local i32 @memcmp(ptr nocapture noundef, ptr nocapture noundef, i64 noundef) local_unnamed_addr #3
-
-; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-attributes #1 = { argmemonly mustprogress nofree nosync nounwind willreturn }
-attributes #2 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-attributes #3 = { argmemonly mustprogress nofree nounwind readonly willreturn "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-attributes #4 = { nounwind }
-
-!llvm.module.flags = !{!0, !1}
-!llvm.ident = !{!2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 7, !"frame-pointer", i32 2}
-!2 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git dea65874b2505f8f5e8e51fd8cad6908feb375ec)"}
diff --git a/llvm/test/CodeGen/Generic/llc-start-stop.ll b/llvm/test/CodeGen/Generic/llc-start-stop.ll
index b02472473a00cb..9ada245835981b 100644
--- a/llvm/test/CodeGen/Generic/llc-start-stop.ll
+++ b/llvm/test/CodeGen/Generic/llc-start-stop.ll
@@ -19,15 +19,15 @@
 ; STOP-BEFORE-NOT: Loop Strength Reduction
 
 ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER
-; START-AFTER: -aa -mergeicmps
+; START-AFTER: -gc-lowering
 ; START-AFTER: FunctionPass Manager
-; START-AFTER-NEXT: Dominator Tree Construction
+; START-AFTER-NEXT: Lower Garbage Collection Instructions
 
 ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE
 ; START-BEFORE: -machine-branch-prob -regalloc-evict -regalloc-priority -domtree
 ; START-BEFORE: FunctionPass Manager
 ; START-BEFORE: Loop Strength Reduction
-; START-BEFORE-NEXT: Basic Alias Analysis (stateless AA impl)
+; START-BEFORE-NEXT: Lower Garbage Collection Instructions
 
 ; RUN: not --crash llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE
 ; RUN: not --crash llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE
diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
index 3134d940545e80..696d8c8be017cb 100644
--- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
@@ -23,8 +23,8 @@
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
-; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
@@ -44,13 +44,6 @@
 ; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results
-; CHECK-NEXT:       Merge contiguous icmps into a memcmp
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
 ; CHECK-NEXT:       Lower constant intrinsics
diff --git a/llvm/test/CodeGen/M68k/pipeline.ll b/llvm/test/CodeGen/M68k/pipeline.ll
index dfaa149b7a4744..ad053cf4d61a07 100644
--- a/llvm/test/CodeGen/M68k/pipeline.ll
+++ b/llvm/test/CodeGen/M68k/pipeline.ll
@@ -15,13 +15,6 @@
 ; CHECK-NEXT:        Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:        Induction Variable Users
 ; CHECK-NEXT:        Loop Strength Reduction
-; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:      Function Alias Analysis Results
-; CHECK-NEXT:      Merge contiguous icmps into a memcmp
-; CHECK-NEXT:      Natural Loop Information
-; CHECK-NEXT:      Lazy Branch Probability Analysis
-; CHECK-NEXT:      Lazy Block Frequency Analysis
-; CHECK-NEXT:      Expand memcmp() to load/stores
 ; CHECK-NEXT:      Lower Garbage Collection Instructions
 ; CHECK-NEXT:      Shadow Stack GC Lowering
 ; CHECK-NEXT:      Lower constant intrinsics
diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
index 6ce4416211cc4d..1fdb4802eff036 100644
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -11,8 +11,8 @@
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
-; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
@@ -45,13 +45,6 @@
 ; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results
-; CHECK-NEXT:       Merge contiguous icmps into a memcmp
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
 ; CHECK-NEXT:       Lower constant intrinsics
diff --git a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
deleted file mode 100644
index 1da40d46aa7730..00000000000000
--- a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
+++ /dev/null
@@ -1,168 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -ppc-gpr-icmps=all -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s
-target datalayout = "e-m:e-i64:64-n32:64"
-target triple = "powerpc64le-unknown-linux-gnu"
-
- at zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4
- at zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4
- at zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4
- at zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4
- at zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4
- at zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4
- at zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4
- at zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4
-
-declare signext i32 @memcmp(ptr nocapture, ptr nocapture, i64) local_unnamed_addr #1
-
-; Check 4 bytes - requires 1 load for each param.
-define signext i32 @zeroEqualityTest02(ptr %x, ptr %y) {
-; CHECK-LABEL: zeroEqualityTest02:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lwz 3, 0(3)
-; CHECK-NEXT:    lwz 4, 0(4)
-; CHECK-NEXT:    xor 3, 3, 4
-; CHECK-NEXT:    cntlzw 3, 3
-; CHECK-NEXT:    srwi 3, 3, 5
-; CHECK-NEXT:    xori 3, 3, 1
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 4)
-  %not.cmp = icmp ne i32 %call, 0
-  %. = zext i1 %not.cmp to i32
-  ret i32 %.
-}
-
-; Check 16 bytes - requires 2 loads for each param (or use vectors?).
-define signext i32 @zeroEqualityTest01(ptr %x, ptr %y) {
-; CHECK-LABEL: zeroEqualityTest01:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    ld 5, 0(3)
-; CHECK-NEXT:    ld 6, 0(4)
-; CHECK-NEXT:    cmpld 5, 6
-; CHECK-NEXT:    bne 0, .LBB1_2
-; CHECK-NEXT:  # %bb.1: # %loadbb1
-; CHECK-NEXT:    ld 5, 8(3)
-; CHECK-NEXT:    ld 4, 8(4)
-; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    cmpld 5, 4
-; CHECK-NEXT:    beqlr 0
-; CHECK-NEXT:  .LBB1_2: # %res_block
-; CHECK-NEXT:    li 3, 1
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 16)
-  %not.tobool = icmp ne i32 %call, 0
-  %. = zext i1 %not.tobool to i32
-  ret i32 %.
-}
-
-; Check 7 bytes - requires 3 loads for each param.
-define signext i32 @zeroEqualityTest03(ptr %x, ptr %y) {
-; CHECK-LABEL: zeroEqualityTest03:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lwz 5, 0(3)
-; CHECK-NEXT:    lwz 6, 0(4)
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    bne 0, .LBB2_3
-; CHECK-NEXT:  # %bb.1: # %loadbb1
-; CHECK-NEXT:    lhz 5, 4(3)
-; CHECK-NEXT:    lhz 6, 4(4)
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    bne 0, .LBB2_3
-; CHECK-NEXT:  # %bb.2: # %loadbb2
-; CHECK-NEXT:    lbz 5, 6(3)
-; CHECK-NEXT:    lbz 4, 6(4)
-; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    cmplw 5, 4
-; CHECK-NEXT:    beqlr 0
-; CHECK-NEXT:  .LBB2_3: # %res_block
-; CHECK-NEXT:    li 3, 1
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 7)
-  %not.lnot = icmp ne i32 %call, 0
-  %cond = zext i1 %not.lnot to i32
-  ret i32 %cond
-}
-
-; Validate with > 0
-define signext i32 @zeroEqualityTest04() {
-; CHECK-LABEL: zeroEqualityTest04:
-; CHECK:       # %bb.0: # %loadbb
-; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest02.buffer1, ptr @zeroEqualityTest02.buffer2, i64 16)
-  %not.cmp = icmp slt i32 %call, 1
-  %. = zext i1 %not.cmp to i32
-  ret i32 %.
-}
-
-; Validate with < 0
-define signext i32 @zeroEqualityTest05() {
-; CHECK-LABEL: zeroEqualityTest05:
-; CHECK:       # %bb.0: # %loadbb
-; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest03.buffer1, ptr @zeroEqualityTest03.buffer2, i64 16)
-  %call.lobit = lshr i32 %call, 31
-  %call.lobit.not = xor i32 %call.lobit, 1
-  ret i32 %call.lobit.not
-}
-
-; Validate with memcmp()?:
-define signext i32 @equalityFoldTwoConstants() {
-; CHECK-LABEL: equalityFoldTwoConstants:
-; CHECK:       # %bb.0: # %loadbb
-; CHECK-NEXT:    li 3, 1
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr @zeroEqualityTest04.buffer2, i64 16)
-  %not.tobool = icmp eq i32 %call, 0
-  %cond = zext i1 %not.tobool to i32
-  ret i32 %cond
-}
-
-define signext i32 @equalityFoldOneConstant(ptr %X) {
-; CHECK-LABEL: equalityFoldOneConstant:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li 5, 1
-; CHECK-NEXT:    ld 4, 0(3)
-; CHECK-NEXT:    rldic 5, 5, 32, 31
-; CHECK-NEXT:    cmpld 4, 5
-; CHECK-NEXT:    bne 0, .LBB6_2
-; CHECK-NEXT:  # %bb.1: # %loadbb1
-; CHECK-NEXT:    lis 5, -32768
-; CHECK-NEXT:    ld 4, 8(3)
-; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    ori 5, 5, 1
-; CHECK-NEXT:    rldic 5, 5, 1, 30
-; CHECK-NEXT:    cmpld 4, 5
-; CHECK-NEXT:    beq 0, .LBB6_3
-; CHECK-NEXT:  .LBB6_2: # %res_block
-; CHECK-NEXT:    li 3, 1
-; CHECK-NEXT:  .LBB6_3: # %endblock
-; CHECK-NEXT:    cntlzw 3, 3
-; CHECK-NEXT:    srwi 3, 3, 5
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr %X, i64 16)
-  %not.tobool = icmp eq i32 %call, 0
-  %cond = zext i1 %not.tobool to i32
-  ret i32 %cond
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_eq_nobuiltin_attr:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    mflr 0
-; CHECK-NEXT:    stdu 1, -32(1)
-; CHECK-NEXT:    li 5, 2
-; CHECK-NEXT:    std 0, 48(1)
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    cntlzw 3, 3
-; CHECK-NEXT:    rlwinm 3, 3, 27, 31, 31
-; CHECK-NEXT:    addi 1, 1, 32
-; CHECK-NEXT:    ld 0, 16(1)
-; CHECK-NEXT:    mtlr 0
-; CHECK-NEXT:    blr
-  %m = tail call signext i32 @memcmp(ptr %X, ptr %Y, i64 2) nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
diff --git a/llvm/test/CodeGen/PowerPC/memcmp-mergeexpand.ll b/llvm/test/CodeGen/PowerPC/memcmp-mergeexpand.ll
deleted file mode 100644
index 29910646c89371..00000000000000
--- a/llvm/test/CodeGen/PowerPC/memcmp-mergeexpand.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux  < %s | FileCheck %s -check-prefix=PPC64LE
-
-; This tests interaction between MergeICmp and expand-memcmp.
-
-%"struct.std::pair" = type { i32, i32 }
-
-define zeroext i1 @opeq1(
-; PPC64LE-LABEL: opeq1:
-; PPC64LE:       # %bb.0: # %"entry+land.rhs.i"
-; PPC64LE-NEXT:    ld 3, 0(3)
-; PPC64LE-NEXT:    ld 4, 0(4)
-; PPC64LE-NEXT:    cmpd 3, 4
-; PPC64LE-NEXT:    li 3, 0
-; PPC64LE-NEXT:    li 4, 1
-; PPC64LE-NEXT:    iseleq 3, 4, 3
-; PPC64LE-NEXT:    blr
-  ptr nocapture readonly dereferenceable(8) %a,
-  ptr nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
-entry:
-  %0 = load i32, ptr %a, align 4
-  %1 = load i32, ptr %b, align 4
-  %cmp.i = icmp eq i32 %0, %1
-  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
-
-land.rhs.i:
-  %second.i = getelementptr inbounds %"struct.std::pair", ptr %a, i64 0, i32 1
-  %2 = load i32, ptr %second.i, align 4
-  %second2.i = getelementptr inbounds %"struct.std::pair", ptr %b, i64 0, i32 1
-  %3 = load i32, ptr %second2.i, align 4
-  %cmp3.i = icmp eq i32 %2, %3
-  br label %opeq1.exit
-
-opeq1.exit:
-  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
-  ret i1 %4
-}
-
-
diff --git a/llvm/test/CodeGen/PowerPC/memcmp.ll b/llvm/test/CodeGen/PowerPC/memcmp.ll
deleted file mode 100644
index 0634534b9c9df1..00000000000000
--- a/llvm/test/CodeGen/PowerPC/memcmp.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux  < %s | FileCheck %s -check-prefix=CHECK
-
-define signext i32 @memcmp8(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
-; CHECK-LABEL: memcmp8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    ldbrx 3, 0, 3
-; CHECK-NEXT:    ldbrx 4, 0, 4
-; CHECK-NEXT:    subc 5, 4, 3
-; CHECK-NEXT:    subfe 5, 4, 4
-; CHECK-NEXT:    subc 4, 3, 4
-; CHECK-NEXT:    subfe 3, 3, 3
-; CHECK-NEXT:    neg 5, 5
-; CHECK-NEXT:    neg 3, 3
-; CHECK-NEXT:    sub 3, 5, 3
-; CHECK-NEXT:    extsw 3, 3
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 8)
-  ret i32 %call
-}
-
-define signext i32 @memcmp4(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
-; CHECK-LABEL: memcmp4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lwbrx 3, 0, 3
-; CHECK-NEXT:    lwbrx 4, 0, 4
-; CHECK-NEXT:    sub 5, 4, 3
-; CHECK-NEXT:    sub 3, 3, 4
-; CHECK-NEXT:    rldicl 5, 5, 1, 63
-; CHECK-NEXT:    rldicl 3, 3, 1, 63
-; CHECK-NEXT:    sub 3, 5, 3
-; CHECK-NEXT:    extsw 3, 3
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
-  ret i32 %call
-}
-
-define signext i32 @memcmp2(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
-; CHECK-LABEL: memcmp2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lhbrx 3, 0, 3
-; CHECK-NEXT:    lhbrx 4, 0, 4
-; CHECK-NEXT:    sub 3, 3, 4
-; CHECK-NEXT:    extsw 3, 3
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 2)
-  ret i32 %call
-}
-
-define signext i32 @memcmp1(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
-; CHECK-LABEL: memcmp1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lbz 3, 0(3)
-; CHECK-NEXT:    lbz 4, 0(4)
-; CHECK-NEXT:    sub 3, 3, 4
-; CHECK-NEXT:    extsw 3, 3
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 1) #2
-  ret i32 %call
-}
-
-declare signext i32 @memcmp(ptr, ptr, i64)
diff --git a/llvm/test/CodeGen/PowerPC/memcmpIR.ll b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
deleted file mode 100644
index 0a8bec7dc0e3f1..00000000000000
--- a/llvm/test/CodeGen/PowerPC/memcmpIR.ll
+++ /dev/null
@@ -1,178 +0,0 @@
-; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s
-; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE
-
-define signext i32 @test1(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
-entry:
-  ; CHECK-LABEL: @test1(
-  ; CHECK-LABEL: res_block:{{.*}}
-  ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
-  ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
-  ; CHECK-NEXT: br label %endblock
-
-  ; CHECK-LABEL: loadbb:{{.*}}
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i64, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
-  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
-
-  ; CHECK-LABEL: loadbb1:{{.*}}
-  ; CHECK-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
-  ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
-  ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, ptr [[GEP1]]
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr [[GEP2]]
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
-  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT:  br i1 [[ICMP]], label %endblock, label %res_block
-
-  ; CHECK-BE-LABEL: @test1(
-  ; CHECK-BE-LABEL: res_block:{{.*}}
-  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
-  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
-  ; CHECK-BE-NEXT: br label %endblock
-
-  ; CHECK-BE-LABEL: loadbb:{{.*}}
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
-  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
-
-  ; CHECK-BE-LABEL: loadbb1:{{.*}}
-  ; CHECK-BE-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
-  ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
-  ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, ptr [[GEP1]]
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr [[GEP2]]
-  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %endblock, label %res_block
-
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 16)
-  ret i32 %call
-}
-
-declare signext i32 @memcmp(ptr nocapture, ptr nocapture, i64) local_unnamed_addr #1
-
-define signext i32 @test2(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
-  ; CHECK-LABEL: @test2(
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i32, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
-  ; CHECK-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
-  ; CHECK-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
-  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
-  ; CHECK-NEXT: ret i32 [[SUB]]
-
-  ; CHECK-BE-LABEL: @test2(
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
-  ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
-  ; CHECK-BE-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
-  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
-  ; CHECK-BE-NEXT: ret i32 [[SUB]]
-
-entry:
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
-  ret i32 %call
-}
-
-define signext i32 @test3(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
-  ; CHECK-LABEL: res_block:{{.*}}
-  ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
-  ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
-  ; CHECK-NEXT: br label %endblock
-
-  ; CHECK-LABEL: loadbb:{{.*}}
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i64, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
-  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
-
-  ; CHECK-LABEL: loadbb1:{{.*}}
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i32, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
-  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
-  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
-  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-NEXT:  br i1 [[ICMP]], label %loadbb2, label %res_block
-
-  ; CHECK-LABEL: loadbb2:{{.*}}
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i16, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, ptr
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]])
-  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64
-  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[BSWAP2]] to i64
-  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-NEXT:  br i1 [[ICMP]], label %loadbb3, label %res_block
-
-  ; CHECK-LABEL: loadbb3:{{.*}}
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i8, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, ptr
-  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
-  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
-  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-NEXT:  br label %endblock
-
-  ; CHECK-BE-LABEL: res_block:{{.*}}
-  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
-  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
-  ; CHECK-BE-NEXT: br label %endblock
-
-  ; CHECK-BE-LABEL: loadbb:{{.*}}
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
-  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
-
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
-  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
-  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
-  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb2, label %res_block
-
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, ptr
-  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64
-  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64
-  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb3, label %res_block
-
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, ptr
-  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
-  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
-  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-BE-NEXT:  br label %endblock
-
-entry:
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 15)
-  ret i32 %call
-}
-  ; CHECK: call = tail call signext i32 @memcmp
-  ; CHECK-BE: call = tail call signext i32 @memcmp
-define signext i32 @test4(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
-
-entry:
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 65)
-  ret i32 %call
-}
-
-define signext i32 @test5(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2, i32 signext %SIZE)  {
-  ; CHECK: call = tail call signext i32 @memcmp
-  ; CHECK-BE: call = tail call signext i32 @memcmp
-entry:
-  %conv = sext i32 %SIZE to i64
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 %conv)
-  ret i32 %call
-}
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index e7db8ef9d5aff3..8b07c7015dcceb 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -45,13 +45,6 @@
 ; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results
-; CHECK-NEXT:       Merge contiguous icmps into a memcmp
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
 ; CHECK-NEXT:       Lower constant intrinsics
@@ -193,7 +186,7 @@
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Stack Frame Layout Analysis
 ; CHECK-NEXT:       RISC-V Zcmp move merging pass
-; CHECK-NEXT:       RISC-V Zcmp Push/Pop optimization pass 
+; CHECK-NEXT:       RISC-V Zcmp Push/Pop optimization pass
 ; CHECK-NEXT:       RISC-V pseudo instruction expansion pass
 ; CHECK-NEXT:       RISC-V atomic pseudo instruction expansion pass
 ; CHECK-NEXT:       Unpack machine instruction bundles
diff --git a/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll b/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll
deleted file mode 100644
index c16e2adb7a0783..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown               | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown             | FileCheck %s --check-prefix=X64
-
-; This tests interaction between MergeICmp and ExpandMemCmp.
-
-%"struct.std::pair" = type { i32, i32 }
-
-define zeroext i1 @opeq1(
-; X86-LABEL: opeq1:
-; X86:       # %bb.0: # %"entry+land.rhs.i"
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-;
-; X64-LABEL: opeq1:
-; X64:       # %bb.0: # %"entry+land.rhs.i"
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  ptr nocapture readonly dereferenceable(8) %a,
-  ptr nocapture readonly dereferenceable(8) %b) local_unnamed_addr nofree nosync {
-entry:
-  %0 = load i32, ptr %a, align 4
-  %1 = load i32, ptr %b, align 4
-  %cmp.i = icmp eq i32 %0, %1
-  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
-
-land.rhs.i:
-  %second.i = getelementptr inbounds %"struct.std::pair", ptr %a, i64 0, i32 1
-  %2 = load i32, ptr %second.i, align 4
-  %second2.i = getelementptr inbounds %"struct.std::pair", ptr %b, i64 0, i32 1
-  %3 = load i32, ptr %second2.i, align 4
-  %cmp3.i = icmp eq i32 %2, %3
-  br label %opeq1.exit
-
-opeq1.exit:
-  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
-  ret i1 %4
-}
-
-
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
deleted file mode 100644
index ae1320f8b0868b..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
+++ /dev/null
@@ -1,445 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i32)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length2:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length2_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length2_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw $12849, (%eax) # imm = 0x3231
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $3
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length3_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $3
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length4:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $4
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length4_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length4_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length5:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $5
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length5_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $5
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length8:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $8
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length8_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $8
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length8_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $8
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length12_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length12:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length16:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind minsize {
-; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind minsize {
-; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length24:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind minsize {
-; X86-LABEL: length24_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length24_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length32:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind minsize {
-; X86-LABEL: length32_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length32_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length64:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind minsize {
-; X86-LABEL: length64_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length64_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize.ll b/llvm/test/CodeGen/X86/memcmp-minsize.ll
deleted file mode 100644
index 544d1c49f26b99..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-minsize.ll
+++ /dev/null
@@ -1,433 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length2:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $2
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length2_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpw (%rsi), %ax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind minsize {
-; X64-LABEL: length2_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpw $12849, (%rdi) # imm = 0x3231
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $2
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length3:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $3
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length3_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $3
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length4:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $4
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length4_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    cmpl (%rsi), %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind minsize {
-; X64-LABEL: length4_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length5:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $5
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length5_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $5
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length8:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $8
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length8_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind minsize {
-; X64-LABEL: length8_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT:    cmpq %rax, (%rdi)
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length12_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $12
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length12:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $12
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind minsize {
-;
-; X64-LABEL: length16:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $16
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind minsize {
-; X64-SSE2-LABEL: length16_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind minsize {
-; X64-SSE2-LABEL: length16_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length24:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $24
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind minsize {
-; X64-LABEL: length24_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $24
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind minsize {
-; X64-LABEL: length24_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $24
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length32:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $32
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind minsize {
-; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    pushq $32
-; X64-SSE2-NEXT:    popq %rdx
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind minsize {
-; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    pushq $32
-; X64-SSE2-NEXT:    popq %rdx
-; X64-SSE2-NEXT:    movl $.L.str, %esi
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length64:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $64
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind minsize {
-; X64-LABEL: length64_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $64
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind minsize {
-; X64-LABEL: length64_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $64
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
deleted file mode 100644
index 0253d131226083..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
+++ /dev/null
@@ -1,2911 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way.
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=cmov     | FileCheck %s --check-prefixes=X86,X86-NOSSE
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse     | FileCheck %s --check-prefixes=X86,X86-SSE1
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse2    | FileCheck %s --check-prefixes=X86,X86-SSE2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1  | FileCheck %s --check-prefixes=X86,X86-SSE41
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i32)
-
-define i32 @length0(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0:
-; X86:       # %bb.0:
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    retl
-   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-   ret i32 %m
- }
-
-define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movb $1, %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length2(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind {
-; X86-LABEL: length2_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    jne .LBB9_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB9_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length3_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    setb %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind {
-; X86-LABEL: length4_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB16_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB16_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB18_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB18_2
-; X86-NEXT:  .LBB18_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB18_2: # %endblock
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length7(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB19_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB19_3
-; X86-NEXT:  .LBB19_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB19_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  ret i32 %m
-}
-
-define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 3(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 3(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB21_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB21_3
-; X86-NEXT:  .LBB21_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB21_3: # %endblock
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length8:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB22_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB22_3
-; X86-NEXT:  .LBB22_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB22_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length8_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind {
-; X86-LABEL: length8_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length9_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movzbl 8(%ecx), %ecx
-; X86-NEXT:    xorb 8(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length10_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movzwl 8(%ecx), %ecx
-; X86-NEXT:    xorw 8(%eax), %cx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 10) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length11_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl 7(%ecx), %ecx
-; X86-NEXT:    xorl 7(%eax), %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 11) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length12_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl 8(%ecx), %ecx
-; X86-NEXT:    xorl 8(%eax), %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length12:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB29_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB29_3
-; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movl 8(%esi), %ecx
-; X86-NEXT:    movl 8(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB29_4
-; X86-NEXT:  .LBB29_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB29_4: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  ret i32 %m
-}
-
-define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length13_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl 4(%edx), %eax
-; X86-NEXT:    xorl (%ecx), %esi
-; X86-NEXT:    xorl 4(%ecx), %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl 8(%edx), %esi
-; X86-NEXT:    xorl 8(%ecx), %esi
-; X86-NEXT:    movzbl 12(%edx), %edx
-; X86-NEXT:    xorb 12(%ecx), %dl
-; X86-NEXT:    movzbl %dl, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 13) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length14_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl 4(%edx), %eax
-; X86-NEXT:    xorl (%ecx), %esi
-; X86-NEXT:    xorl 4(%ecx), %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl 8(%edx), %esi
-; X86-NEXT:    xorl 8(%ecx), %esi
-; X86-NEXT:    movzwl 12(%edx), %edx
-; X86-NEXT:    xorw 12(%ecx), %dx
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 14) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl 4(%edx), %eax
-; X86-NEXT:    xorl (%ecx), %esi
-; X86-NEXT:    xorl 4(%ecx), %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl 8(%edx), %esi
-; X86-NEXT:    xorl 8(%ecx), %esi
-; X86-NEXT:    movl 11(%edx), %edx
-; X86-NEXT:    xorl 11(%ecx), %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length16:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB33_4
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB33_4
-; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movl 8(%esi), %ecx
-; X86-NEXT:    movl 8(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB33_4
-; X86-NEXT:  # %bb.3: # %loadbb3
-; X86-NEXT:    movl 12(%esi), %ecx
-; X86-NEXT:    movl 12(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB33_5
-; X86-NEXT:  .LBB33_4: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB33_5: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl (%edx), %esi
-; X86-NOSSE-NEXT:    movl 4(%edx), %eax
-; X86-NOSSE-NEXT:    xorl (%ecx), %esi
-; X86-NOSSE-NEXT:    xorl 4(%ecx), %eax
-; X86-NOSSE-NEXT:    orl %esi, %eax
-; X86-NOSSE-NEXT:    movl 8(%edx), %esi
-; X86-NOSSE-NEXT:    xorl 8(%ecx), %esi
-; X86-NOSSE-NEXT:    movl 12(%edx), %edx
-; X86-NOSSE-NEXT:    xorl 12(%ecx), %edx
-; X86-NOSSE-NEXT:    orl %esi, %edx
-; X86-NOSSE-NEXT:    orl %eax, %edx
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    popl %esi
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length16_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE1-NEXT:    movl (%edx), %esi
-; X86-SSE1-NEXT:    movl 4(%edx), %eax
-; X86-SSE1-NEXT:    xorl (%ecx), %esi
-; X86-SSE1-NEXT:    xorl 4(%ecx), %eax
-; X86-SSE1-NEXT:    orl %esi, %eax
-; X86-SSE1-NEXT:    movl 8(%edx), %esi
-; X86-SSE1-NEXT:    xorl 8(%ecx), %esi
-; X86-SSE1-NEXT:    movl 12(%edx), %edx
-; X86-SSE1-NEXT:    xorl 12(%ecx), %edx
-; X86-SSE1-NEXT:    orl %esi, %edx
-; X86-SSE1-NEXT:    orl %eax, %edx
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length16_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length16_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB35_4
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB35_4
-; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movl 8(%esi), %ecx
-; X86-NEXT:    movl 8(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB35_4
-; X86-NEXT:  # %bb.3: # %loadbb3
-; X86-NEXT:    movl 12(%esi), %ecx
-; X86-NEXT:    movl 12(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB35_5
-; X86-NEXT:  .LBB35_4: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB35_5: # %endblock
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length16_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    movl (%edx), %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    jne .LBB36_4
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %eax
-; X86-NEXT:    movl 4(%edx), %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    jne .LBB36_4
-; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movl 8(%esi), %eax
-; X86-NEXT:    movl 8(%edx), %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    jne .LBB36_4
-; X86-NEXT:  # %bb.3: # %loadbb3
-; X86-NEXT:    movl 12(%esi), %eax
-; X86-NEXT:    movl 12(%edx), %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    je .LBB36_5
-; X86-NEXT:  .LBB36_4: # %res_block
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    orl $1, %edx
-; X86-NEXT:  .LBB36_5: # %endblock
-; X86-NEXT:    testl %edx, %edx
-; X86-NEXT:    setg %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NOSSE-NEXT:    xorl (%eax), %ecx
-; X86-NOSSE-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NOSSE-NEXT:    xorl 4(%eax), %edx
-; X86-NOSSE-NEXT:    orl %ecx, %edx
-; X86-NOSSE-NEXT:    movl $825243960, %ecx # imm = 0x31303938
-; X86-NOSSE-NEXT:    xorl 8(%eax), %ecx
-; X86-NOSSE-NEXT:    movl $892613426, %esi # imm = 0x35343332
-; X86-NOSSE-NEXT:    xorl 12(%eax), %esi
-; X86-NOSSE-NEXT:    orl %ecx, %esi
-; X86-NOSSE-NEXT:    orl %edx, %esi
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    popl %esi
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length16_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-SSE1-NEXT:    xorl (%eax), %ecx
-; X86-SSE1-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-SSE1-NEXT:    xorl 4(%eax), %edx
-; X86-SSE1-NEXT:    orl %ecx, %edx
-; X86-SSE1-NEXT:    movl $825243960, %ecx # imm = 0x31303938
-; X86-SSE1-NEXT:    xorl 8(%eax), %ecx
-; X86-SSE1-NEXT:    movl $892613426, %esi # imm = 0x35343332
-; X86-SSE1-NEXT:    xorl 12(%eax), %esi
-; X86-SSE1-NEXT:    orl %ecx, %esi
-; X86-SSE1-NEXT:    orl %edx, %esi
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length16_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length24:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length24_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length24_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $24
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length24_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length24_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length24_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length24_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length24_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $24
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length24_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length31(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length31:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 31) nounwind
-  ret i32 %m
-}
-
-define i1 @length31_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length31_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length31_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length31_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-NOSSE-LABEL: length31_eq_prefer128:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq_prefer128:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq_prefer128:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq_prefer128:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length31_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 31) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length32:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length32_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length32_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length32_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-NOSSE-LABEL: length32_eq_prefer128:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq_prefer128:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_prefer128:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq_prefer128:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length32_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length48(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length48:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 48) nounwind
-  ret i32 %m
-}
-
-define i1 @length48_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length48_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $48
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length48_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $48
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length48_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm0, %xmm2
-; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length48_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE41-NEXT:    por %xmm0, %xmm2
-; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length48_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length48_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-NOSSE-LABEL: length48_eq_prefer128:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $48
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length48_eq_prefer128:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $48
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length48_eq_prefer128:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm0, %xmm2
-; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length48_eq_prefer128:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE41-NEXT:    por %xmm0, %xmm2
-; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length48_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $48
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length48_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $48
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length48_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    pand %xmm0, %xmm2
-; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length48_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT:    por %xmm0, %xmm2
-; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 48) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length63(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length63:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 63) nounwind
-  ret i32 %m
-}
-
-define i1 @length63_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length63_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $63
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length63_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $63
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length63_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqu 47(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT:    pand %xmm2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pmovmskb %xmm3, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length63_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE41-NEXT:    movdqu 47(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
-; X86-SSE41-NEXT:    por %xmm2, %xmm3
-; X86-SSE41-NEXT:    por %xmm0, %xmm3
-; X86-SSE41-NEXT:    ptest %xmm3, %xmm3
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length63_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length63_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length63_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $63
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length63_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $63
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length63_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    pand %xmm3, %xmm2
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length63_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT:    por %xmm3, %xmm2
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 63) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length64:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length64_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $64
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length64_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $64
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length64_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqu 48(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT:    pand %xmm2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pmovmskb %xmm3, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length64_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE41-NEXT:    movdqu 48(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
-; X86-SSE41-NEXT:    por %xmm2, %xmm3
-; X86-SSE41-NEXT:    por %xmm0, %xmm3
-; X86-SSE41-NEXT:    ptest %xmm3, %xmm3
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length64_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length64_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length64_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $64
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length64_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $64
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length64_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    pand %xmm3, %xmm2
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length64_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT:    por %xmm3, %xmm2
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length96(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length96:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 96) nounwind
-  ret i32 %m
-}
-
-define i1 @length96_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_eq_const(ptr %X) nounwind {
-; X86-LABEL: length96_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 96) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length127(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length127:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 127) nounwind
-  ret i32 %m
-}
-
-define i1 @length127_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_eq_const(ptr %X) nounwind {
-; X86-LABEL: length127_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 127) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length128(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length128:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 128) nounwind
-  ret i32 %m
-}
-
-define i1 @length128_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_eq_const(ptr %X) nounwind {
-; X86-LABEL: length128_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 128) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length192(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length192:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 192) nounwind
-  ret i32 %m
-}
-
-define i1 @length192_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_eq_const(ptr %X) nounwind {
-; X86-LABEL: length192_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 192) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length255(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length255:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 255) nounwind
-  ret i32 %m
-}
-
-define i1 @length255_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_eq_const(ptr %X) nounwind {
-; X86-LABEL: length255_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 255) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length256(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length256:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 256) nounwind
-  ret i32 %m
-}
-
-define i1 @length256_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_eq_const(ptr %X) nounwind {
-; X86-LABEL: length256_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 256) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length384(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length384:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 384) nounwind
-  ret i32 %m
-}
-
-define i1 @length384_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_eq_const(ptr %X) nounwind {
-; X86-LABEL: length384_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 384) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length511(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length511:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 511) nounwind
-  ret i32 %m
-}
-
-define i1 @length511_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_eq_const(ptr %X) nounwind {
-; X86-LABEL: length511_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 511) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length512(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length512:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 512) nounwind
-  ret i32 %m
-}
-
-define i1 @length512_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_eq_const(ptr %X) nounwind {
-; X86-LABEL: length512_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 512) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks that we do not do stupid things with huge sizes.
-define i32 @huge_length(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: huge_length:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
-  ret i32 %m
-}
-
-define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: huge_length_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks non-constant sizes.
-define i32 @nonconst_length(ptr %X, ptr %Y, i32 %size) nounwind {
-; X86-LABEL: nonconst_length:
-; X86:       # %bb.0:
-; X86-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
-  ret i32 %m
-}
-
-define i1 @nonconst_length_eq(ptr %X, ptr %Y, i32 %size) nounwind {
-; X86-LABEL: nonconst_length_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
deleted file mode 100644
index 6eb02bfc1fd0c3..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ /dev/null
@@ -1,4006 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way.
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown               | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx    | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx2   | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,+prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,-prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512BW
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512F
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX512F
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-
-define i32 @length0(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    retq
-   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-   ret i32 %m
- }
-
-define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movb $1, %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length2(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpw (%rsi), %ax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind {
-; X64-LABEL: length2_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $2, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length3:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    movzwl (%rsi), %edx
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    rolw $8, %dx
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    jne .LBB9_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 2(%rdi), %eax
-; X64-NEXT:    movzbl 2(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB9_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length3_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    xorw (%rsi), %ax
-; X64-NEXT:    movzbl 2(%rdi), %ecx
-; X64-NEXT:    xorb 2(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orw %ax, %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    cmpl (%rsi), %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl (%rsi), %ecx
-; X64-NEXT:    bswapl %eax
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    setb %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl (%rsi), %ecx
-; X64-NEXT:    bswapl %eax
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    seta %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind {
-; X64-LABEL: length4_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB16_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB16_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    movzbl 4(%rdi), %ecx
-; X64-NEXT:    xorb 4(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB18_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB18_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length7(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB19_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    movl 3(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB19_3
-; X64-NEXT:  .LBB19_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB19_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  ret i32 %m
-}
-
-define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    xorl 3(%rsi), %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB21_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    movl 3(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB21_3
-; X64-NEXT:  .LBB21_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB21_3: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length8:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length8_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind {
-; X64-LABEL: length8_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT:    cmpq %rax, (%rdi)
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length9_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movzbl 8(%rdi), %ecx
-; X64-NEXT:    xorb 8(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length10_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movzwl 8(%rdi), %ecx
-; X64-NEXT:    xorw 8(%rsi), %cx
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length11_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 3(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 3(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length12_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    xorl 8(%rsi), %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length12:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB29_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB29_3
-; X64-NEXT:  .LBB29_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB29_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length13_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 5(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 5(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length14_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 6(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 6(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 7(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 7(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length16:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB33_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB33_3
-; X64-NEXT:  .LBB33_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB33_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length16_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length16_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length16_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length16_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB35_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB35_3
-; X64-NEXT:  .LBB35_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB35_3: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length16_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq (%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB36_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    movq 8(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB36_3
-; X64-NEXT:  .LBB36_2: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB36_3: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length16_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length16_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length16_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length24:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB38_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB38_3
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB38_4
-; X64-NEXT:  .LBB38_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB38_4: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length24_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    por %xmm1, %xmm2
-; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length24_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length24_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB40_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB40_3
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB40_4
-; X64-NEXT:  .LBB40_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB40_4: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length24_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq (%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB41_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    movq 8(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB41_3
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rax
-; X64-NEXT:    movq 16(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB41_4
-; X64-NEXT:  .LBB41_3: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB41_4: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length24_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length24_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length31(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length31:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB43_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB43_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB43_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 23(%rdi), %rcx
-; X64-NEXT:    movq 23(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB43_5
-; X64-NEXT:  .LBB43_4: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB43_5: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
-  ret i32 %m
-}
-
-define i1 @length31_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length31_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length31_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB45_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB45_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB45_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 23(%rdi), %rcx
-; X64-NEXT:    movq 23(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB45_5
-; X64-NEXT:  .LBB45_4: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB45_5: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length31_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq (%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB46_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    movq 8(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB46_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rax
-; X64-NEXT:    movq 16(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB46_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 23(%rdi), %rax
-; X64-NEXT:    movq 23(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB46_5
-; X64-NEXT:  .LBB46_4: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB46_5: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-SSE2-LABEL: length31_eq_prefer128:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq_prefer128:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq_prefer128:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq_prefer128:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length31_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [943142453,842084409,909456435,809056311]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length32:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB49_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB49_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB49_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 24(%rdi), %rcx
-; X64-NEXT:    movq 24(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB49_5
-; X64-NEXT:  .LBB49_4: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB49_5: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length32_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length32_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB51_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB51_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB51_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 24(%rdi), %rcx
-; X64-NEXT:    movq 24(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB51_5
-; X64-NEXT:  .LBB51_4: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB51_5: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length32_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq (%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB52_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    movq 8(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB52_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rax
-; X64-NEXT:    movq 16(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB52_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 24(%rdi), %rax
-; X64-NEXT:    movq 24(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB52_5
-; X64-NEXT:  .LBB52_4: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB52_5: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-SSE2-LABEL: length32_eq_prefer128:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq_prefer128:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length32_eq_prefer128:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq_prefer128:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length32_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length48(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length48:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
-  ret i32 %m
-}
-
-define i1 @length48_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length48_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm3
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm3, %xmm0
-; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
-; X64-SSE2-NEXT:    pand %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length48_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm3
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm3
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm3, %xmm0
-; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE41-NEXT:    por %xmm0, %xmm1
-; X64-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length48_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-AVX1-NEXT:    vmovups 32(%rsi), %xmm2
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps %ymm2, %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length48_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX2-NEXT:    vmovdqu 32(%rsi), %xmm2
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length48_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX512-NEXT:    vmovdqu 32(%rsi), %xmm2
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length48_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length48_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length48_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-SSE2-LABEL: length48_eq_prefer128:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm3
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm3, %xmm0
-; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
-; X64-SSE2-NEXT:    pand %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length48_eq_prefer128:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm3
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm3
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm3, %xmm0
-; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE41-NEXT:    por %xmm0, %xmm1
-; X64-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length48_eq_prefer128:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
-; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpxor 32(%rsi), %xmm2, %xmm1
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length48_eq_prefer128:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm4
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm5
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm4, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm5, %zmm2, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length48_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE2-NEXT:    pand %xmm0, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length48_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE41-NEXT:    por %xmm0, %xmm2
-; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length48_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length48_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length48_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length48_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length63(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length63:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
-  ret i32 %m
-}
-
-define i1 @length63_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length63_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu 47(%rdi), %xmm3
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm4
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm4
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm4, %xmm0
-; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
-; X64-SSE2-NEXT:    movdqu 47(%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pand %xmm0, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length63_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu 47(%rdi), %xmm3
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm4
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm4
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm4, %xmm0
-; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE41-NEXT:    movdqu 47(%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm3, %xmm2
-; X64-SSE41-NEXT:    por %xmm1, %xmm2
-; X64-SSE41-NEXT:    por %xmm0, %xmm2
-; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length63_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 31(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length63_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length63_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length63_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm2
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rsi), %ymm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length63_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length63_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length63_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu 47(%rdi), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE2-NEXT:    pand %xmm3, %xmm2
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length63_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu 47(%rdi), %xmm3
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE41-NEXT:    por %xmm3, %xmm2
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length63_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length63_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length63_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length63_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [875770417,943142453,842084409,909456435,809056311,875770417,943142453,842084409]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length64:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length64_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu 48(%rdi), %xmm3
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm4
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm4
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm4, %xmm0
-; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
-; X64-SSE2-NEXT:    movdqu 48(%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pand %xmm0, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length64_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu 48(%rdi), %xmm3
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm4
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm4
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm4, %xmm0
-; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE41-NEXT:    movdqu 48(%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm3, %xmm2
-; X64-SSE41-NEXT:    por %xmm1, %xmm2
-; X64-SSE41-NEXT:    por %xmm0, %xmm2
-; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length64_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-AVX512-NEXT:    kortestw %k0, %k0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length64_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rsi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rsi), %ymm3
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length64_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length64_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length64_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length64_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu 48(%rdi), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE2-NEXT:    pand %xmm3, %xmm2
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length64_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu 48(%rdi), %xmm3
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE41-NEXT:    por %xmm3, %xmm2
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length64_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-AVX512-NEXT:    kortestw %k0, %k0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length64_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,825243960,892613426,959985462,858927408]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length64_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length96(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length96:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
-  ret i32 %m
-}
-
-define i1 @length96_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length96_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $96, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length96_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps 64(%rsi), %ymm2, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length96_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor 64(%rsi), %ymm2, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length96_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb %zmm2, %zmm1, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length96_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512F-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length96_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rsi), %ymm3
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rsi), %ymm4
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rsi), %ymm5
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm4, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm5, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length96_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length96_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length96_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length96_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $96, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length96_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length96_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length96_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length96_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length96_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [892613426,959985462,858927408,926299444,825243960,892613426,959985462,858927408]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [926299444,825243960,892613426,959985462,858927408,926299444,825243960,892613426]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm0, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length96_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length127(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length127:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
-  ret i32 %m
-}
-
-define i1 @length127_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length127_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $127, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length127_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vmovups 95(%rdi), %ymm3
-; X64-AVX1-NEXT:    vxorps 95(%rsi), %ymm3, %ymm3
-; X64-AVX1-NEXT:    vxorps 64(%rsi), %ymm2, %ymm2
-; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length127_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vmovdqu 95(%rdi), %ymm3
-; X64-AVX2-NEXT:    vpxor 95(%rsi), %ymm3, %ymm3
-; X64-AVX2-NEXT:    vpxor 64(%rsi), %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length127_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb 63(%rsi), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length127_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd 63(%rsi), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length127_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 95(%rdi), %ymm3
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rsi), %ymm4
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rsi), %ymm5
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rsi), %ymm6
-; X64-MIC-AVX2-NEXT:    vmovdqu 95(%rsi), %ymm7
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm7, %zmm3, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm6, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm5, %zmm1, %k1
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm4, %zmm0, %k2
-; X64-MIC-AVX2-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length127_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 63(%rsi), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length127_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length127_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length127_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $127, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length127_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vmovups 95(%rdi), %ymm3
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length127_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vmovdqu 95(%rdi), %ymm3
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length127_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+63(%rip), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length127_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+63(%rip), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length127_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 95(%rdi), %ymm3
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [943142453,842084409,909456435,809056311,875770417,943142453,842084409,909456435]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm4, %zmm3, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [926299444,825243960,892613426,959985462,858927408,926299444,825243960,892613426]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,825243960,892613426,959985462,858927408]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm1, %zmm0, %k2
-; X64-MIC-AVX2-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length127_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+63(%rip), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length128(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length128:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
-  ret i32 %m
-}
-
-define i1 @length128_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length128_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $128, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length128_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vmovups 96(%rdi), %ymm3
-; X64-AVX1-NEXT:    vxorps 96(%rsi), %ymm3, %ymm3
-; X64-AVX1-NEXT:    vxorps 64(%rsi), %ymm2, %ymm2
-; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length128_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vmovdqu 96(%rdi), %ymm3
-; X64-AVX2-NEXT:    vpxor 96(%rsi), %ymm3, %ymm3
-; X64-AVX2-NEXT:    vpxor 64(%rsi), %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length128_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb 64(%rsi), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length128_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length128_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 96(%rdi), %ymm3
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rsi), %ymm4
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rsi), %ymm5
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rsi), %ymm6
-; X64-MIC-AVX2-NEXT:    vmovdqu 96(%rsi), %ymm7
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm7, %zmm3, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm6, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm5, %zmm1, %k1
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm4, %zmm0, %k2
-; X64-MIC-AVX2-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length128_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length128_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length128_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length128_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $128, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length128_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vmovups 96(%rdi), %ymm3
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length128_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vmovdqu 96(%rdi), %ymm3
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length128_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length128_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length128_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 96(%rdi), %ymm3
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [959985462,858927408,926299444,825243960,892613426,959985462,858927408,926299444]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm4, %zmm3, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [926299444,825243960,892613426,959985462,858927408,926299444,825243960,892613426]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,825243960,892613426,959985462,858927408]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm1, %zmm0, %k2
-; X64-MIC-AVX2-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length128_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length192(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length192:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
-  ret i32 %m
-}
-
-define i1 @length192_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length192_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $192, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length192_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $192, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length192_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $192, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length192_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vpcmpneqb 64(%rsi), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb 128(%rsi), %zmm2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length192_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length192_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $192, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length192_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length192_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length192_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length192_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $192, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length192_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $192, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length192_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $192, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length192_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length192_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length192_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $192, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length192_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length255(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length255:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
-  ret i32 %m
-}
-
-define i1 @length255_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length255_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $255, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length255_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $255, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length255_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $255, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length255_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512BW-NEXT:    vpcmpneqb 191(%rsi), %zmm3, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb 128(%rsi), %zmm2, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb 64(%rsi), %zmm1, %k1
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k2
-; X64-AVX512BW-NEXT:    korq %k1, %k2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length255_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512F-NEXT:    vpcmpneqd 191(%rsi), %zmm3, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k1
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k2
-; X64-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length255_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $255, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length255_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 191(%rsi), %zmm3, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k2
-; X64-MIC-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length255_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length255_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length255_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $255, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length255_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $255, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length255_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $255, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length255_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+191(%rip), %zmm3, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+64(%rip), %zmm1, %k1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k2
-; X64-AVX512BW-NEXT:    korq %k1, %k2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length255_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+191(%rip), %zmm3, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k2
-; X64-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length255_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $255, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length255_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+191(%rip), %zmm3, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k2
-; X64-MIC-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length256(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length256:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
-  ret i32 %m
-}
-
-define i1 @length256_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length256_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $256, %edx # imm = 0x100
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length256_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $256, %edx # imm = 0x100
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length256_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $256, %edx # imm = 0x100
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length256_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512BW-NEXT:    vpcmpneqb 192(%rsi), %zmm3, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb 128(%rsi), %zmm2, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb 64(%rsi), %zmm1, %k1
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k2
-; X64-AVX512BW-NEXT:    korq %k1, %k2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length256_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512F-NEXT:    vpcmpneqd 192(%rsi), %zmm3, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k1
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k2
-; X64-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length256_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $256, %edx # imm = 0x100
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length256_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 192(%rsi), %zmm3, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k2
-; X64-MIC-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length256_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length256_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length256_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $256, %edx # imm = 0x100
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length256_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $256, %edx # imm = 0x100
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length256_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $256, %edx # imm = 0x100
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length256_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+192(%rip), %zmm3, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+64(%rip), %zmm1, %k1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k2
-; X64-AVX512BW-NEXT:    korq %k1, %k2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length256_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+192(%rip), %zmm3, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k2
-; X64-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length256_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $256, %edx # imm = 0x100
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length256_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+192(%rip), %zmm3, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k2
-; X64-MIC-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length384(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length384:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
-  ret i32 %m
-}
-
-define i1 @length384_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_eq_const(ptr %X) nounwind {
-; X64-LABEL: length384_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length511(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length511:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
-  ret i32 %m
-}
-
-define i1 @length511_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_eq_const(ptr %X) nounwind {
-; X64-LABEL: length511_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length512(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length512:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
-  ret i32 %m
-}
-
-define i1 @length512_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_eq_const(ptr %X) nounwind {
-; X64-LABEL: length512_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks that we do not do stupid things with huge sizes.
-define i32 @huge_length(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: huge_length:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  ret i32 %m
-}
-
-define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: huge_length_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks non-constant sizes.
-define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
-; X64-LABEL: nonconst_length:
-; X64:       # %bb.0:
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  ret i32 %m
-}
-
-define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
-; X64-LABEL: nonconst_length_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
deleted file mode 100644
index 762691151f4bd3..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ /dev/null
@@ -1,583 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i32)
-declare dso_local i32 @bcmp(ptr, ptr, i32)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length2_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind optsize {
-; X86-LABEL: length2_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    jne .LBB4_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB4_2
-; X86-NEXT:  .LBB4_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB4_2: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length3_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movb 2(%ecx), %cl
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length4:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length4_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind optsize {
-; X86-LABEL: length4_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length5:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB9_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB9_2
-; X86-NEXT:  .LBB9_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB9_2: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length5_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movb 4(%ecx), %cl
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length8:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB11_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB11_3
-; X86-NEXT:  .LBB11_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB11_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length8_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind optsize {
-; X86-LABEL: length8_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length12_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length12:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length16:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind optsize {
-; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind optsize {
-; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length24:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize {
-; X86-NOSSE-LABEL: length24_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind optsize {
-; X86-NOSSE-LABEL: length24_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length32:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize {
-; X86-NOSSE-LABEL: length32_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind optsize {
-; X86-NOSSE-LABEL: length32_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length64:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize {
-; X86-LABEL: length64_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind optsize {
-; X86-LABEL: length64_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: bcmp_length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw (%ecx), %dx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
deleted file mode 100644
index c0c7b98d471cd4..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ /dev/null
@@ -1,596 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-declare dso_local i32 @bcmp(ptr, ptr, i64)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length2_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpw (%rsi), %ax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind optsize {
-; X64-LABEL: length2_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $2, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length3:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    movzwl (%rsi), %edx
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    rolw $8, %dx
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    jne .LBB4_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 2(%rdi), %eax
-; X64-NEXT:    movzbl 2(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB4_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length3_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    xorw (%rsi), %ax
-; X64-NEXT:    movb 2(%rdi), %cl
-; X64-NEXT:    xorb 2(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orw %ax, %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length4:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length4_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    cmpl (%rsi), %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind optsize {
-; X64-LABEL: length4_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length5:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB9_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB9_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length5_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    movb 4(%rdi), %cl
-; X64-NEXT:    xorb 4(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length8:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length8_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind optsize {
-; X64-LABEL: length8_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT:    cmpq %rax, (%rdi)
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length12_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    xorl 8(%rsi), %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length12:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB15_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB15_3
-; X64-NEXT:  .LBB15_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB15_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length16:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB16_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB16_3
-; X64-NEXT:  .LBB16_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB16_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind optsize {
-; X64-SSE2-LABEL: length16_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind optsize {
-; X64-SSE2-LABEL: length16_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length24:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $24, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize {
-; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind optsize {
-; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length32:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize {
-; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind optsize {
-; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length64:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize {
-; X64-SSE2-LABEL: length64_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    movl $64, %edx
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind optsize {
-; X64-SSE2-LABEL: length64_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    movl $.L.str, %esi
-; X64-SSE2-NEXT:    movl $64, %edx
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: bcmp_length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw (%rsi), %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
deleted file mode 100644
index cb45fd3ebb9068..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ /dev/null
@@ -1,600 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i32)
-declare dso_local i32 @bcmp(ptr, ptr, i32)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length2_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind !prof !14 {
-; X86-LABEL: length2_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    jne .LBB4_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB4_2
-; X86-NEXT:  .LBB4_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB4_2: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length3_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movb 2(%ecx), %cl
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length4:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length4_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind !prof !14 {
-; X86-LABEL: length4_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length5:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB9_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB9_2
-; X86-NEXT:  .LBB9_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB9_2: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length5_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movb 4(%ecx), %cl
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length8:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB11_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB11_3
-; X86-NEXT:  .LBB11_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB11_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length8_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind !prof !14 {
-; X86-LABEL: length8_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length12_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length12:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length16:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length24:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length24_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length24_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length32:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length32_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length32_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length64:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X86-LABEL: length64_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind !prof !14 {
-; X86-LABEL: length64_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: bcmp_length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw (%ecx), %dx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"ProfileSummary", !1}
-!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
-!2 = !{!"ProfileFormat", !"InstrProf"}
-!3 = !{!"TotalCount", i32 10000}
-!4 = !{!"MaxCount", i32 10}
-!5 = !{!"MaxInternalCount", i32 1}
-!6 = !{!"MaxFunctionCount", i32 1000}
-!7 = !{!"NumCounts", i32 3}
-!8 = !{!"NumFunctions", i32 3}
-!9 = !{!"DetailedSummary", !10}
-!10 = !{!11, !12, !13}
-!11 = !{i32 10000, i32 100, i32 1}
-!12 = !{i32 999000, i32 100, i32 1}
-!13 = !{i32 999999, i32 1, i32 2}
-!14 = !{!"function_entry_count", i32 0}
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
deleted file mode 100644
index 720344a22e43b5..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ /dev/null
@@ -1,613 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-declare dso_local i32 @bcmp(ptr, ptr, i64)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length2_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpw (%rsi), %ax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind !prof !14 {
-; X64-LABEL: length2_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $2, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length3:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    movzwl (%rsi), %edx
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    rolw $8, %dx
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    jne .LBB4_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 2(%rdi), %eax
-; X64-NEXT:    movzbl 2(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB4_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length3_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    xorw (%rsi), %ax
-; X64-NEXT:    movb 2(%rdi), %cl
-; X64-NEXT:    xorb 2(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orw %ax, %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length4:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length4_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    cmpl (%rsi), %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind !prof !14 {
-; X64-LABEL: length4_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length5:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB9_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB9_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length5_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    movb 4(%rdi), %cl
-; X64-NEXT:    xorb 4(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length8:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length8_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind !prof !14 {
-; X64-LABEL: length8_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT:    cmpq %rax, (%rdi)
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length12_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    xorl 8(%rsi), %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length12:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB15_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB15_3
-; X64-NEXT:  .LBB15_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB15_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length16:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB16_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB16_3
-; X64-NEXT:  .LBB16_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB16_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X64-SSE2-LABEL: length16_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind !prof !14 {
-; X64-SSE2-LABEL: length16_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length24:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $24, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind !prof !14 {
-; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length32:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind !prof !14 {
-; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length64:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X64-SSE2-LABEL: length64_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    movl $64, %edx
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind !prof !14 {
-; X64-SSE2-LABEL: length64_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    movl $.L.str, %esi
-; X64-SSE2-NEXT:    movl $64, %edx
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: bcmp_length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw (%rsi), %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"ProfileSummary", !1}
-!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
-!2 = !{!"ProfileFormat", !"InstrProf"}
-!3 = !{!"TotalCount", i64 10000}
-!4 = !{!"MaxCount", i64 10}
-!5 = !{!"MaxInternalCount", i64 1}
-!6 = !{!"MaxFunctionCount", i64 1000}
-!7 = !{!"NumCounts", i64 3}
-!8 = !{!"NumFunctions", i64 3}
-!9 = !{!"DetailedSummary", !10}
-!10 = !{!11, !12, !13}
-!11 = !{i32 10000, i64 100, i32 1}
-!12 = !{i32 999000, i64 100, i32 1}
-!13 = !{i32 999999, i64 1, i32 2}
-!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll
deleted file mode 100644
index a63402cea20962..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-x32.ll
+++ /dev/null
@@ -1,2429 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov     | FileCheck %s --check-prefixes=X86,X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse     | FileCheck %s --check-prefixes=X86,X86-SSE1
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2    | FileCheck %s --check-prefixes=X86,X86-SSE2
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1  | FileCheck %s --check-prefixes=X86,X86-SSE41
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i32)
-
-define i32 @length0(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0:
-; X86:       # %bb.0:
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    retl
-   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-   ret i32 %m
- }
-
-define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movb $1, %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length2(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-define i32 @length2_const(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    addl $-12594, %eax # imm = 0xCECE
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_gt_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    addl $-12594, %eax # imm = 0xCECE
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind {
-; X86-LABEL: length2_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    jne .LBB11_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB11_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length3_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    setb %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind {
-; X86-LABEL: length4_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB18_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB18_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB20_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB20_2
-; X86-NEXT:  .LBB20_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB20_2: # %endblock
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length7(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB21_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB21_3
-; X86-NEXT:  .LBB21_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB21_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  ret i32 %m
-}
-
-define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB22_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB22_3
-; X86-NEXT:  .LBB22_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB22_3: # %endblock
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 3(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 3(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length8:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB24_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB24_3
-; X86-NEXT:  .LBB24_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB24_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length8_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind {
-; X86-LABEL: length8_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length9_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $9
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length10_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $10
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 10) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length11_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $11
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 11) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length12_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length12:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  ret i32 %m
-}
-
-define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length13_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $13
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 13) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length14_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $14
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 14) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15_const(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl $.L.str+1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15_gt_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl $.L.str+1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length16:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length16_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $16
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length16_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length16_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length16_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length16_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $16
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length16_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length24:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length24_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length24_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $24
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length24_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length24_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length24_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length24_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length24_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $24
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length24_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length31(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length31:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 31) nounwind
-  ret i32 %m
-}
-
-define i1 @length31_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length31_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length31_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length31_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-NOSSE-LABEL: length31_eq_prefer128:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq_prefer128:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq_prefer128:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq_prefer128:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length31_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 31) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length32:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length32_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length32_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length32_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-NOSSE-LABEL: length32_eq_prefer128:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq_prefer128:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_prefer128:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq_prefer128:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length32_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length48(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length48:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 48) nounwind
-  ret i32 %m
-}
-
-define i1 @length48_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length48_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length48_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length48_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-LABEL: length48_eq_prefer128:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_const(ptr %X) nounwind {
-; X86-LABEL: length48_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 48) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length63(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length63:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 63) nounwind
-  ret i32 %m
-}
-
-define i1 @length63_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length63_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length63_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length63_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_eq_const(ptr %X) nounwind {
-; X86-LABEL: length63_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 63) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length64:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length64_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length64_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length64_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind {
-; X86-LABEL: length64_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length96(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length96:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 96) nounwind
-  ret i32 %m
-}
-
-define i1 @length96_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_eq_const(ptr %X) nounwind {
-; X86-LABEL: length96_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 96) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length127(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length127:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 127) nounwind
-  ret i32 %m
-}
-
-define i1 @length127_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_eq_const(ptr %X) nounwind {
-; X86-LABEL: length127_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 127) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length128(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length128:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 128) nounwind
-  ret i32 %m
-}
-
-define i1 @length128_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_eq_const(ptr %X) nounwind {
-; X86-LABEL: length128_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 128) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length192(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length192:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 192) nounwind
-  ret i32 %m
-}
-
-define i1 @length192_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_eq_const(ptr %X) nounwind {
-; X86-LABEL: length192_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 192) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length255(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length255:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 255) nounwind
-  ret i32 %m
-}
-
-define i1 @length255_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_eq_const(ptr %X) nounwind {
-; X86-LABEL: length255_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 255) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length256(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length256:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 256) nounwind
-  ret i32 %m
-}
-
-define i1 @length256_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_eq_const(ptr %X) nounwind {
-; X86-LABEL: length256_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 256) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length384(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length384:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 384) nounwind
-  ret i32 %m
-}
-
-define i1 @length384_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_eq_const(ptr %X) nounwind {
-; X86-LABEL: length384_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 384) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length511(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length511:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 511) nounwind
-  ret i32 %m
-}
-
-define i1 @length511_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_eq_const(ptr %X) nounwind {
-; X86-LABEL: length511_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 511) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length512(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length512:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 512) nounwind
-  ret i32 %m
-}
-
-define i1 @length512_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_eq_const(ptr %X) nounwind {
-; X86-LABEL: length512_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 512) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks that we do not do stupid things with huge sizes.
-define i32 @huge_length(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: huge_length:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
-  ret i32 %m
-}
-
-define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: huge_length_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks non-constant sizes.
-define i32 @nonconst_length(ptr %X, ptr %Y, i32 %size) nounwind {
-; X86-LABEL: nonconst_length:
-; X86:       # %bb.0:
-; X86-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
-  ret i32 %m
-}
-
-define i1 @nonconst_length_eq(ptr %X, ptr %Y, i32 %size) nounwind {
-; X86-LABEL: nonconst_length_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
deleted file mode 100644
index f5e7384362a92b..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ /dev/null
@@ -1,3065 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown               | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx    | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2   | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,+prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,-prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX512F
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-
-define i32 @length0(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    retq
-   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-   ret i32 %m
- }
-
-define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movb $1, %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length2(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i32 @length2_const(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    addl $-12594, %eax # imm = 0xCECE
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_gt_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    addl $-12594, %eax # imm = 0xCECE
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpw (%rsi), %ax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind {
-; X64-LABEL: length2_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $2, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length3:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    movzwl (%rsi), %edx
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    rolw $8, %dx
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    jne .LBB11_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 2(%rdi), %eax
-; X64-NEXT:    movzbl 2(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB11_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length3_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    xorw (%rsi), %ax
-; X64-NEXT:    movzbl 2(%rdi), %ecx
-; X64-NEXT:    xorb 2(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orw %ax, %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    cmpl (%rsi), %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl (%rsi), %ecx
-; X64-NEXT:    bswapl %eax
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    setb %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl (%rsi), %ecx
-; X64-NEXT:    bswapl %eax
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    seta %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind {
-; X64-LABEL: length4_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB18_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB18_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    movzbl 4(%rdi), %ecx
-; X64-NEXT:    xorb 4(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB20_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB20_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length7(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB21_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    movl 3(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB21_3
-; X64-NEXT:  .LBB21_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB21_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  ret i32 %m
-}
-
-define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB22_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    movl 3(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB22_3
-; X64-NEXT:  .LBB22_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB22_3: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    xorl 3(%rsi), %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length8:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length8_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind {
-; X64-LABEL: length8_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT:    cmpq %rax, (%rdi)
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length9_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movzbl 8(%rdi), %ecx
-; X64-NEXT:    xorb 8(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length10_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movzwl 8(%rdi), %ecx
-; X64-NEXT:    xorw 8(%rsi), %cx
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length11_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 3(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 3(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length12_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    xorl 8(%rsi), %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length12:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB31_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB31_3
-; X64-NEXT:  .LBB31_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB31_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length13_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 5(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 5(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length14_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 6(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 6(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB34_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 7(%rdi), %rcx
-; X64-NEXT:    movq 7(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB34_3
-; X64-NEXT:  .LBB34_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB34_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB35_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 7(%rdi), %rcx
-; X64-NEXT:    movq 7(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB35_3
-; X64-NEXT:  .LBB35_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB35_3: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15_const(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3544952156018063160, %rcx # imm = 0x3132333435363738
-; X64-NEXT:    movq (%rdi), %rdx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    jne .LBB36_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movabsq $4051322327650219061, %rcx # imm = 0x3839303132333435
-; X64-NEXT:    movq 7(%rdi), %rdx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    je .LBB36_3
-; X64-NEXT:  .LBB36_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB36_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 7(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 7(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15_gt_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3544952156018063160, %rax # imm = 0x3132333435363738
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    jne .LBB38_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movabsq $4051322327650219061, %rax # imm = 0x3839303132333435
-; X64-NEXT:    movq 7(%rdi), %rcx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    je .LBB38_3
-; X64-NEXT:  .LBB38_2: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB38_3: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length16:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB39_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB39_3
-; X64-NEXT:  .LBB39_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB39_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length16_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length16_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length16_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length16_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB41_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB41_3
-; X64-NEXT:  .LBB41_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB41_3: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length16_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq (%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB42_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    movq 8(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB42_3
-; X64-NEXT:  .LBB42_2: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB42_3: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length16_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length16_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length16_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length24:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $24, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length24_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    por %xmm1, %xmm2
-; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length24_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length24_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $24, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length24_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $24, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length24_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length24_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length31(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length31:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $31, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
-  ret i32 %m
-}
-
-define i1 @length31_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length31_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length31_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $31, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length31_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $31, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-SSE2-LABEL: length31_eq_prefer128:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq_prefer128:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq_prefer128:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq_prefer128:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length31_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [943142453,842084409,909456435,809056311]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length32:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length32_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length32_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length32_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-SSE2-LABEL: length32_eq_prefer128:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq_prefer128:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length32_eq_prefer128:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq_prefer128:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length32_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length48(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length48:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
-  ret i32 %m
-}
-
-define i1 @length48_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length48_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $48, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length48_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-AVX1-NEXT:    vmovups 32(%rsi), %xmm2
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps %ymm2, %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length48_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX2-NEXT:    vmovdqu 32(%rsi), %xmm2
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length48_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX512-NEXT:    vmovdqu 32(%rsi), %xmm2
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length48_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length48_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length48_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-LABEL: length48_eq_prefer128:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length48_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $48, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length48_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length48_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length48_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length48_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length63(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length63:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
-  ret i32 %m
-}
-
-define i1 @length63_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length63_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $63, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length63_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 31(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length63_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length63_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length63_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm2
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rsi), %ymm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length63_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length63_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length63_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $63, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length63_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length63_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length63_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length63_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [875770417,943142453,842084409,909456435,809056311,875770417,943142453,842084409]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length64:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length64_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $64, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length64_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-AVX512-NEXT:    kortestw %k0, %k0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length64_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rsi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rsi), %ymm3
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length64_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length64_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length64_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length64_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $64, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length64_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-AVX512-NEXT:    kortestw %k0, %k0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length64_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,825243960,892613426,959985462,858927408]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length64_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length96(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length96:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
-  ret i32 %m
-}
-
-define i1 @length96_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length96_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $96, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length96_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $96, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length96_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $96, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length96_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb %zmm2, %zmm1, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length96_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512F-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length96_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $96, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length96_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length96_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length96_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length96_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $96, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length96_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $96, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length96_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $96, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length96_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length96_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length96_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $96, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length96_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length127(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length127:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
-  ret i32 %m
-}
-
-define i1 @length127_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length127_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $127, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length127_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $127, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length127_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $127, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length127_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb 63(%rsi), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length127_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd 63(%rsi), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length127_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $127, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length127_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 63(%rsi), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length127_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length127_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length127_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $127, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length127_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $127, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length127_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $127, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length127_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+63(%rip), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length127_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+63(%rip), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length127_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $127, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length127_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+63(%rip), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length128(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length128:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
-  ret i32 %m
-}
-
-define i1 @length128_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length128_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $128, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length128_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $128, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length128_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $128, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length128_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb 64(%rsi), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length128_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length128_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $128, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length128_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length128_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length128_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length128_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $128, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length128_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $128, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length128_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $128, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length128_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length128_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length128_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $128, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length128_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length192(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length192:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
-  ret i32 %m
-}
-
-define i1 @length192_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length192_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length192_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length192_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_eq_const(ptr %X) nounwind {
-; X64-LABEL: length192_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length255(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length255:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
-  ret i32 %m
-}
-
-define i1 @length255_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length255_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length255_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length255_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_eq_const(ptr %X) nounwind {
-; X64-LABEL: length255_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length256(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length256:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
-  ret i32 %m
-}
-
-define i1 @length256_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length256_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length256_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length256_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_eq_const(ptr %X) nounwind {
-; X64-LABEL: length256_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length384(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length384:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
-  ret i32 %m
-}
-
-define i1 @length384_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_eq_const(ptr %X) nounwind {
-; X64-LABEL: length384_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length511(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length511:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
-  ret i32 %m
-}
-
-define i1 @length511_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_eq_const(ptr %X) nounwind {
-; X64-LABEL: length511_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length512(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length512:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
-  ret i32 %m
-}
-
-define i1 @length512_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_eq_const(ptr %X) nounwind {
-; X64-LABEL: length512_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks that we do not do stupid things with huge sizes.
-define i32 @huge_length(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: huge_length:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  ret i32 %m
-}
-
-define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: huge_length_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks non-constant sizes.
-define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
-; X64-LABEL: nonconst_length:
-; X64:       # %bb.0:
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  ret i32 %m
-}
-
-define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
-; X64-LABEL: nonconst_length_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index fb8d2335b34106..e9eddf35f7403c 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -19,8 +19,8 @@
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
-; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
@@ -42,13 +42,6 @@
 ; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:         Function Alias Analysis Results
-; CHECK-NEXT:       Merge contiguous icmps into a memcmp
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
 ; CHECK-NEXT:       Lower constant intrinsics
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index ecdb5a5e010d92..ce13b2eb52a7ef 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -142,10 +142,12 @@
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-EP-CGSCC-LATE-NEXT: Running pass: NoOpCGSCCPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index 064362eabbf839..d6f09a85953c14 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -81,10 +81,12 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 19a44867e434ac..cc3939c5bdcf7b 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -69,10 +69,12 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index ac80a31d8fd4bc..bf354c91d15f37 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -77,10 +77,12 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 6486639e07b49c..9c5f9fd281ee7c 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -112,10 +112,12 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index 09f9f0f48baddb..92ab5b6bbc74ad 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -102,17 +102,23 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: TargetIRAnalysis on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running analysis: BasicAA on foo
+; CHECK-O-NEXT: Running analysis: AssumptionAnalysis on foo
+; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis on foo
+; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA on foo
+; CHECK-O-NEXT: Running analysis: TypeBasedAA on foo
+; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
+; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
+; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
+; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
+; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
-; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis
-; CHECK-O-NEXT: Running analysis: AssumptionAnalysis
-; CHECK-O-NEXT: Running analysis: TargetIRAnalysis
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
-; CHECK-O-NEXT: Running analysis: BasicAA
-; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA
-; CHECK-O-NEXT: Running analysis: TypeBasedAA
-; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
@@ -120,10 +126,6 @@
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
-; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
-; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
-; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
-; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
 ; CHECK-O1-NEXT: Running pass: LibCallsShrinkWrapPass
 ; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 47bdbfd2d357d4..b565e80ac05e90 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -81,10 +81,12 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/bcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/bcmp.ll
new file mode 100644
index 00000000000000..18141e72007f7a
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/bcmp.ll
@@ -0,0 +1,751 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp  < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+declare i32 @bcmp(ptr, ptr, i64)
+
+define i1 @bcmp0(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp0(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    ret i1 true
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 0)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp1(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp1(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 1)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp2(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp2(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 2)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; or (and (xor a, b), C1), (and (xor c, d), C2)
+define i1 @bcmp3(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp3(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 3)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp4(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp4(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 4)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; or (xor a, b), (and (xor c, d), C2)
+define i1 @bcmp5(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp5(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 5)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; or (xor a, b), (and (xor c, d), C2)
+define i1 @bcmp6(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp6(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 6)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; or (xor a, b), (xor c, d)
+define i1 @bcmp7(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp7(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 7)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp8(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp8(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 8)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; or (xor a, b), (and (xor c, d), C2)
+define i1 @bcmp9(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp9(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 9)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp10(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp10(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 10)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp11(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp11(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 11)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp12(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp12(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 12)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp13(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp13(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 5
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 13)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp14(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp14(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 6
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 14)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp15(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp15(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 7
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 7
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 15)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp16(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp16(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 16)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp20(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp20(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or i64 [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i64 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP19]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 20)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp24(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp24(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP17]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 24)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp28(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp28(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[A]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or i64 [[TMP13]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne i64 [[TMP23]], 0
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP25]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 28)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp33(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp33(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[A]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[A]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i8, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i8 [[TMP21]] to i64
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i8 [[TMP22]] to i64
+; CHECK-NEXT:    [[TMP25:%.*]] = xor i64 [[TMP23]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP28:%.*]] = or i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP28]], [[TMP25]]
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne i64 [[TMP29]], 0
+; CHECK-NEXT:    [[TMP31:%.*]] = zext i1 [[TMP30]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP31]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 33)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp38(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp38(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[A]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[A]], i64 30
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 30
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP26:%.*]] = or i64 [[TMP24]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[TMP26]], [[TMP23]]
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne i64 [[TMP27]], 0
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i1 [[TMP28]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP29]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 38)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp45(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp45(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[A]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[A]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[A]], i64 37
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 37
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP32:%.*]] = or i64 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP33:%.*]] = or i64 [[TMP32]], [[TMP31]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne i64 [[TMP33]], 0
+; CHECK-NEXT:    [[TMP35:%.*]] = zext i1 [[TMP34]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP35]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 45)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; Although the large cmp chain may be not profitable on high end CPU, we
+; believe it is better on most cpus, so perform the transform now.
+; 8 xor + 7 or + 1 cmp only need 6 cycles on a 4 width ALU port machine
+;   2 cycle for xor
+;   3 cycle for or
+;   1 cycle for cmp
+define i1 @bcmp64(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp64(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[A]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[A]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[A]], i64 40
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 40
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[A]], i64 48
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 48
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP31]], [[TMP32]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[A]], i64 56
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[B]], i64 56
+; CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr [[TMP34]], align 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i64, ptr [[TMP35]], align 1
+; CHECK-NEXT:    [[TMP38:%.*]] = xor i64 [[TMP36]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP40:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP41:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP42:%.*]] = or i64 [[TMP33]], [[TMP38]]
+; CHECK-NEXT:    [[TMP43:%.*]] = or i64 [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP44:%.*]] = or i64 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP45:%.*]] = or i64 [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne i64 [[TMP45]], 0
+; CHECK-NEXT:    [[TMP47:%.*]] = zext i1 [[TMP46]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP47]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 64)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp89(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp89(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[CR:%.*]] = call i32 @bcmp(ptr [[A]], ptr [[B]], i64 89)
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[CR]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 89)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp_zext(i32 %0, i32 %1, i8 %2, i8 %3) {
+; CHECK-LABEL: define i1 @bcmp_zext(
+; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i8 [[TMP2:%.*]], i8 [[TMP3:%.*]]) {
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i8 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; CHECK-NEXT:    ret i1 [[TMP9]]
+;
+  %5 = xor i32 %1, %0
+  %6 = xor i8 %3, %2
+  %7 = zext i8 %6 to i32
+  %8 = or i32 %5, %7
+  %9 = icmp eq i32 %8, 0
+  ret i1 %9
+}
+
+define i1 @bcmp_i8(i8 %a0, i8 %b0, i8 %a1, i8 %b1, i8 %a2, i8 %b2) {
+; CHECK-LABEL: define i1 @bcmp_i8(
+; CHECK-SAME: i8 [[A0:%.*]], i8 [[B0:%.*]], i8 [[A1:%.*]], i8 [[B1:%.*]], i8 [[A2:%.*]], i8 [[B2:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i8 [[B0]], [[A0]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i8 [[B1]], [[A1]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i8 [[B2]], [[A2]]
+; CHECK-NEXT:    [[OR0:%.*]] = or i8 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i8 [[OR0]], [[XOR2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[OR1]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %xor0 = xor i8 %b0, %a0
+  %xor1 = xor i8 %b1, %a1
+  %xor2 = xor i8 %b2, %a2
+  %or0 = or i8 %xor0, %xor1
+  %or1 = or i8 %or0, %xor2
+  %r = icmp eq i8 %or1, 0
+  ret i1 %r
+}
+
+define i1 @bcmp_i16(i16 %a0, i16 %b0, i16 %a1, i16 %b1, i16 %a2, i16 %b2) {
+; CHECK-LABEL: define i1 @bcmp_i16(
+; CHECK-SAME: i16 [[A0:%.*]], i16 [[B0:%.*]], i16 [[A1:%.*]], i16 [[B1:%.*]], i16 [[A2:%.*]], i16 [[B2:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i16 [[B0]], [[A0]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i16 [[B1]], [[A1]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i16 [[B2]], [[A2]]
+; CHECK-NEXT:    [[OR0:%.*]] = or i16 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i16 [[OR0]], [[XOR2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i16 [[OR1]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %xor0 = xor i16 %b0, %a0
+  %xor1 = xor i16 %b1, %a1
+  %xor2 = xor i16 %b2, %a2
+  %or0 = or i16 %xor0, %xor1
+  %or1 = or i16 %or0, %xor2
+  %r = icmp eq i16 %or1, 0
+  ret i1 %r
+}
+
+define i1 @bcmp_i128(i128 %a0, i128 %b0, i128 %a1, i128 %b1, i128 %a2, i128 %b2) {
+; CHECK-LABEL: define i1 @bcmp_i128(
+; CHECK-SAME: i128 [[A0:%.*]], i128 [[B0:%.*]], i128 [[A1:%.*]], i128 [[B1:%.*]], i128 [[A2:%.*]], i128 [[B2:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i128 [[B0]], [[A0]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i128 [[B1]], [[A1]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i128 [[B2]], [[A2]]
+; CHECK-NEXT:    [[OR0:%.*]] = or i128 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i128 [[OR0]], [[XOR2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i128 [[OR1]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %xor0 = xor i128 %b0, %a0
+  %xor1 = xor i128 %b1, %a1
+  %xor2 = xor i128 %b2, %a2
+  %or0 = or i128 %xor0, %xor1
+  %or1 = or i128 %or0, %xor2
+  %r = icmp ne i128 %or1, 0
+  ret i1 %r
+}
+
+define i1 @bcmp_i42(i42 %a0, i42 %b0, i42 %a1, i42 %b1, i42 %a2, i42 %b2) {
+; CHECK-LABEL: define i1 @bcmp_i42(
+; CHECK-SAME: i42 [[A0:%.*]], i42 [[B0:%.*]], i42 [[A1:%.*]], i42 [[B1:%.*]], i42 [[A2:%.*]], i42 [[B2:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i42 [[B0]], [[A0]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i42 [[B1]], [[A1]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i42 [[B2]], [[A2]]
+; CHECK-NEXT:    [[OR0:%.*]] = or i42 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i42 [[OR0]], [[XOR2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i42 [[OR1]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %xor0 = xor i42 %b0, %a0
+  %xor1 = xor i42 %b1, %a1
+  %xor2 = xor i42 %b2, %a2
+  %or0 = or i42 %xor0, %xor1
+  %or1 = or i42 %or0, %xor2
+  %r = icmp ne i42 %or1, 0
+  ret i1 %r
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp-extra.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp-extra.ll
new file mode 100644
index 00000000000000..e9573816c97880
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp-extra.ll
@@ -0,0 +1,3434 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp  < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length0(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret i32 0
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  ret i32 %m
+  }
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length0_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i1 true
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length0_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i1 false
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length2(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i32 @length2_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length2_const(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_gt_const(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length2_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length3(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i24, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i24, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i24 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i24 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    ret i32 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length3_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length4(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length4_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length4_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4_lt_32(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length4_lt_32(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    ret i32 [[TMP6]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = lshr i32 %m, 31
+  ret i32 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length4_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length4_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length5(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i40, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i40, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i40 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i40 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    ret i32 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length5_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length5_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i40, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i40, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i40 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i40 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length6(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length6(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i48, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i48, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i48 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i48 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    ret i32 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind
+  ret i32 %m
+}
+
+define i32 @length6_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length6_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i48, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i48, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i48 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i48 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind
+  %r = lshr i32 %m, 31
+  ret i32 %r
+}
+
+define i32 @length7(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length7(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  ret i32 %m
+}
+
+define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length7_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length7_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    ret i1 [[TMP10]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length8(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length8_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length8_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length9(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length9(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP5:%.*]], [[TMP6:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
+  ret i32 %m
+}
+
+define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length9_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length10(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length10(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP11]])
+; CHECK-NEXT:    [[TMP14]] = zext i16 [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP15]] = zext i16 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
+  ret i32 %m
+}
+
+define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length10_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length11(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length11(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
+  ret i32 %m
+}
+
+define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length11_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length12_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length12(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; CHECK-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length13_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length14_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length15(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length15(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  ret i32 %m
+}
+
+define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length15_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length15_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length15_const(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; CHECK-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; CHECK-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
+  ret i32 %m
+}
+
+define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length15_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length15_gt_const(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; CHECK-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; CHECK-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+
+define i32 @length16(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length16(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length16_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length16_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length16_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length16_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+
+define i32 @length24(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length24(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length24_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length24_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length24_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length24_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; CHECK-NEXT:    ret i1 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length31(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length31(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
+  ret i32 %m
+}
+
+define i1 @length31_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length31_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP21]], 0
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP23]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length31_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length31_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; CHECK-LABEL: define i1 @length31_eq_prefer128(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP21]], 0
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP23]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length31_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 3474870397276861491
+; CHECK-NEXT:    [[TMP12:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; CHECK-NEXT:    ret i1 [[TMP15]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length32(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length32_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP21]], 0
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP23]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length32_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length32_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; CHECK-LABEL: define i1 @length32_eq_prefer128(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP21]], 0
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP23]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length32_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 3544395820347831604
+; CHECK-NEXT:    [[TMP12:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; CHECK-NEXT:    ret i1 [[TMP15]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length48(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length48(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB5]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
+  ret i32 %m
+}
+
+define i1 @length48_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length48_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP32:%.*]] = or i64 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP33:%.*]] = or i64 [[TMP32]], [[TMP31]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne i64 [[TMP33]], 0
+; CHECK-NEXT:    [[TMP35:%.*]] = zext i1 [[TMP34]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP35]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length48_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB5]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length48_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB5]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; CHECK-LABEL: define i1 @length48_eq_prefer128(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP32:%.*]] = or i64 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP33:%.*]] = or i64 [[TMP32]], [[TMP31]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne i64 [[TMP33]], 0
+; CHECK-NEXT:    [[TMP35:%.*]] = zext i1 [[TMP34]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP35]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length48_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 3544395820347831604
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 4123106164818064178
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 3978425819141910832
+; CHECK-NEXT:    [[TMP18:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i64 [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or i64 [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP22]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i1 [[TMP23]] to i32
+; CHECK-NEXT:    ret i1 [[TMP23]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length63(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length63(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 55
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 55
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
+  ret i32 %m
+}
+
+define i1 @length63_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length63_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP31]], [[TMP32]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[X]], i64 55
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[Y]], i64 55
+; CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr [[TMP34]], align 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i64, ptr [[TMP35]], align 1
+; CHECK-NEXT:    [[TMP38:%.*]] = xor i64 [[TMP36]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP40:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP41:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP42:%.*]] = or i64 [[TMP33]], [[TMP38]]
+; CHECK-NEXT:    [[TMP43:%.*]] = or i64 [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP44:%.*]] = or i64 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP45:%.*]] = or i64 [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne i64 [[TMP45]], 0
+; CHECK-NEXT:    [[TMP47:%.*]] = zext i1 [[TMP46]] to i32
+; CHECK-NEXT:    ret i1 [[TMP46]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length63_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 55
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 55
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length63_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 55
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 55
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length63_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 3544395820347831604
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 4123106164818064178
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 3978425819141910832
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 3833745473465760056
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[X]], i64 55
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 3616724998069630517
+; CHECK-NEXT:    [[TMP24:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP25:%.*]] = or i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP26:%.*]] = or i64 [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[TMP20]], [[TMP23]]
+; CHECK-NEXT:    [[TMP28:%.*]] = or i64 [[TMP24]], [[TMP25]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP28]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 0
+; CHECK-NEXT:    [[TMP32:%.*]] = zext i1 [[TMP31]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP32]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length64(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 56
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 56
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length64_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP31]], [[TMP32]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[X]], i64 56
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[Y]], i64 56
+; CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr [[TMP34]], align 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i64, ptr [[TMP35]], align 1
+; CHECK-NEXT:    [[TMP38:%.*]] = xor i64 [[TMP36]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP40:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP41:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP42:%.*]] = or i64 [[TMP33]], [[TMP38]]
+; CHECK-NEXT:    [[TMP43:%.*]] = or i64 [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP44:%.*]] = or i64 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP45:%.*]] = or i64 [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne i64 [[TMP45]], 0
+; CHECK-NEXT:    [[TMP47:%.*]] = zext i1 [[TMP46]] to i32
+; CHECK-NEXT:    ret i1 [[TMP46]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length64_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 56
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 56
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length64_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 56
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 56
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length64_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 3544395820347831604
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 4123106164818064178
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 3978425819141910832
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 3833745473465760056
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[X]], i64 56
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 3689065127958034230
+; CHECK-NEXT:    [[TMP24:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP25:%.*]] = or i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP26:%.*]] = or i64 [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[TMP20]], [[TMP23]]
+; CHECK-NEXT:    [[TMP28:%.*]] = or i64 [[TMP24]], [[TMP25]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP28]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 0
+; CHECK-NEXT:    [[TMP32:%.*]] = zext i1 [[TMP31]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP32]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length96(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length96(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
+  ret i32 %m
+}
+
+define i1 @length96_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length96_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length96_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length96_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length96_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 96) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length127(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length127(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
+  ret i32 %m
+}
+
+define i1 @length127_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length127_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length127_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length127_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length127_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 127) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length128(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length128(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
+  ret i32 %m
+}
+
+define i1 @length128_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length128_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length128_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length128_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length128_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 128) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length192(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length192(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
+  ret i32 %m
+}
+
+define i1 @length192_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length192_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length192_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length192_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length192_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length255(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length255(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
+  ret i32 %m
+}
+
+define i1 @length255_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length255_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length255_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length255_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length255_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length256(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length256(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
+  ret i32 %m
+}
+
+define i1 @length256_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length256_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length256_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length256_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length256_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length384(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length384(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
+  ret i32 %m
+}
+
+define i1 @length384_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length384_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length384_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length384_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length384_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length511(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length511(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
+  ret i32 %m
+}
+
+define i1 @length511_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length511_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length511_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length511_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length511_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length512(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length512(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
+  ret i32 %m
+}
+
+define i1 @length512_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length512_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length512_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length512_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length512_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @huge_length(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @huge_length(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  ret i32 %m
+}
+
+define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @huge_length_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
+; CHECK-LABEL: define i32 @nonconst_length(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
+; CHECK-LABEL: define i1 @nonconst_length_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
index 92439691e1873c..735fb27da16060 100644
--- a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
 ; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
 
 declare i32 @memcmp(ptr nocapture, ptr nocapture, i64)
diff --git a/llvm/test/Transforms/ExpandMemCmp/BPF/lit.local.cfg b/llvm/test/Transforms/ExpandMemCmp/BPF/lit.local.cfg
new file mode 100644
index 00000000000000..d1828f2b613d9e
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/BPF/lit.local.cfg
@@ -0,0 +1,4 @@
+if "BPF" not in config.root.targets:
+    config.unsupported = True
+if "system-aix" in config.available_features:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/ExpandMemCmp/BPF/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/BPF/memcmp.ll
new file mode 100644
index 00000000000000..1accfe88d1a82a
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/BPF/memcmp.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=bpf < %s | FileCheck %s --check-prefix=BPF
+; RUN: opt -S -passes=expand-memcmp -mtriple=bpf -mcpu=v3 < %s | FileCheck %s --check-prefix=BPF-V3
+;
+; Source code:
+;   /* set aligned 4 to minimize the number of loads */
+;   struct build_id {
+;     unsigned char id[20];
+;   } __attribute__((aligned(4)));
+;
+;   /* try to compute a local build_id */
+;   void bar1(ptr);
+;
+;   /* the global build_id to compare */
+;   struct build_id id2;
+;
+;   int foo()
+;   {
+;     struct build_id id1;
+;
+;     bar1(&id1);
+;     return __builtin_memcmp(&id1, &id2, sizeof(id1)) == 0;
+;   }
+; Compilation flags:
+;   clang -target bpf -S -O2 t.c -emit-llvm
+
+%struct.build_id = type { [20 x i8] }
+
+ at id2 = dso_local global %struct.build_id zeroinitializer, align 4
+
+; Function Attrs: noinline nounwind
+define dso_local i32 @foo() #0 {
+; BPF-LABEL: define dso_local i32 @foo(
+; BPF-SAME: ) #[[ATTR0:[0-9]+]] {
+; BPF-NEXT:  entry:
+; BPF-NEXT:    [[ID1:%.*]] = alloca [[STRUCT_BUILD_ID:%.*]], align 4
+; BPF-NEXT:    call void @bar1(ptr noundef [[ID1]])
+; BPF-NEXT:    br label [[LOADBB:%.*]]
+; BPF:       res_block:
+; BPF-NEXT:    br label [[ENDBLOCK:%.*]]
+; BPF:       loadbb:
+; BPF-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ID1]], align 4
+; BPF-NEXT:    [[TMP1:%.*]] = load i64, ptr @id2, align 4
+; BPF-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP0]], [[TMP1]]
+; BPF-NEXT:    br i1 [[TMP2]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; BPF:       loadbb1:
+; BPF-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[ID1]], i64 8
+; BPF-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 4
+; BPF-NEXT:    [[TMP5:%.*]] = load i64, ptr getelementptr (i8, ptr @id2, i64 8), align 4
+; BPF-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP4]], [[TMP5]]
+; BPF-NEXT:    br i1 [[TMP6]], label [[RES_BLOCK]], label [[LOADBB2:%.*]]
+; BPF:       loadbb2:
+; BPF-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[ID1]], i64 16
+; BPF-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+; BPF-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr (i8, ptr @id2, i64 16), align 4
+; BPF-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP8]], [[TMP9]]
+; BPF-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; BPF:       endblock:
+; BPF-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ]
+; BPF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; BPF-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; BPF-NEXT:    ret i32 [[CONV]]
+;
+; BPF-V3-LABEL: define dso_local i32 @foo(
+; BPF-V3-SAME: ) #[[ATTR0:[0-9]+]] {
+; BPF-V3-NEXT:  entry:
+; BPF-V3-NEXT:    [[ID1:%.*]] = alloca [[STRUCT_BUILD_ID:%.*]], align 4
+; BPF-V3-NEXT:    call void @bar1(ptr noundef [[ID1]])
+; BPF-V3-NEXT:    br label [[LOADBB:%.*]]
+; BPF-V3:       res_block:
+; BPF-V3-NEXT:    br label [[ENDBLOCK:%.*]]
+; BPF-V3:       loadbb:
+; BPF-V3-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ID1]], align 4
+; BPF-V3-NEXT:    [[TMP1:%.*]] = load i64, ptr @id2, align 4
+; BPF-V3-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP0]], [[TMP1]]
+; BPF-V3-NEXT:    br i1 [[TMP2]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; BPF-V3:       loadbb1:
+; BPF-V3-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[ID1]], i64 8
+; BPF-V3-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 4
+; BPF-V3-NEXT:    [[TMP5:%.*]] = load i64, ptr getelementptr (i8, ptr @id2, i64 8), align 4
+; BPF-V3-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP4]], [[TMP5]]
+; BPF-V3-NEXT:    br i1 [[TMP6]], label [[RES_BLOCK]], label [[LOADBB2:%.*]]
+; BPF-V3:       loadbb2:
+; BPF-V3-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[ID1]], i64 16
+; BPF-V3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+; BPF-V3-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr (i8, ptr @id2, i64 16), align 4
+; BPF-V3-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP8]], [[TMP9]]
+; BPF-V3-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; BPF-V3:       endblock:
+; BPF-V3-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ]
+; BPF-V3-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; BPF-V3-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; BPF-V3-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %id1 = alloca %struct.build_id, align 4
+  call void @bar1(ptr noundef %id1)
+  %call = call i32 @memcmp(ptr noundef %id1, ptr noundef @id2, i64 noundef 20) #3
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+declare dso_local void @bar1(ptr noundef) #1
+
+; Function Attrs: nounwind
+declare dso_local i32 @memcmp(ptr noundef, ptr noundef, i64 noundef) #2
+
+attributes #0 = { noinline nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #2 = { nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version 18.0.0git (git at github.com:llvm/llvm-project.git a776740d6296520b8bde156aa3f8d9ecb32cddd9)"}
diff --git a/llvm/test/Transforms/ExpandMemCmp/PowerPC/lit.local.cfg b/llvm/test/Transforms/ExpandMemCmp/PowerPC/lit.local.cfg
new file mode 100644
index 00000000000000..bb982488eb15ee
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/PowerPC/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "PowerPC" in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/ExpandMemCmp/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memCmpUsedInZeroEqualityComparison.ll
new file mode 100644
index 00000000000000..9a75b147e7e1fb
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memCmpUsedInZeroEqualityComparison.ll
@@ -0,0 +1,218 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+ at zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4
+ at zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4
+ at zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4
+ at zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4
+ at zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4
+ at zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4
+ at zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4
+ at zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4
+
+declare signext i32 @memcmp(ptr nocapture, ptr nocapture, i64) local_unnamed_addr #1
+
+; Check 4 bytes - requires 1 load for each param.
+define signext i32 @zeroEqualityTest02(ptr %x, ptr %y) {
+; CHECK-LABEL: define signext i32 @zeroEqualityTest02(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[DOT:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[DOT]]
+;
+  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 4)
+  %not.cmp = icmp ne i32 %call, 0
+  %. = zext i1 %not.cmp to i32
+  ret i32 %.
+}
+
+; Check 16 bytes - requires 2 loads for each param (or use vectors?).
+define signext i32 @zeroEqualityTest01(ptr %x, ptr %y) {
+; CHECK-LABEL: define signext i32 @zeroEqualityTest01(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[NOT_TOBOOL:%.*]] = icmp ne i32 [[PHI_RES]], 0
+; CHECK-NEXT:    [[DOT:%.*]] = zext i1 [[NOT_TOBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[DOT]]
+;
+  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 16)
+  %not.tobool = icmp ne i32 %call, 0
+  %. = zext i1 %not.tobool to i32
+  ret i32 %.
+}
+
+; Check 7 bytes - requires 3 loads for each param.
+define signext i32 @zeroEqualityTest03(ptr %x, ptr %y) {
+; CHECK-LABEL: define signext i32 @zeroEqualityTest03(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i16 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[LOADBB2:%.*]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i8 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[NOT_LNOT:%.*]] = icmp ne i32 [[PHI_RES]], 0
+; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[NOT_LNOT]] to i32
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 7)
+  %not.lnot = icmp ne i32 %call, 0
+  %cond = zext i1 %not.lnot to i32
+  ret i32 %cond
+}
+
+; Validate with > 0
+define signext i32 @zeroEqualityTest04() {
+; CHECK-LABEL: define signext i32 @zeroEqualityTest04(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ 288230376151711744, [[LOADBB]] ], [ 0, [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 216172782113783808, [[LOADBB]] ], [ 0, [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    br i1 false, label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    br i1 true, label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[NOT_CMP:%.*]] = icmp slt i32 [[PHI_RES]], 1
+; CHECK-NEXT:    [[DOT:%.*]] = zext i1 [[NOT_CMP]] to i32
+; CHECK-NEXT:    ret i32 [[DOT]]
+;
+  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest02.buffer1, ptr @zeroEqualityTest02.buffer2, i64 16)
+  %not.cmp = icmp slt i32 %call, 1
+  %. = zext i1 %not.cmp to i32
+  ret i32 %.
+}
+
+; Validate with < 0
+define signext i32 @zeroEqualityTest05() {
+; CHECK-LABEL: define signext i32 @zeroEqualityTest05(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ 0, [[LOADBB]] ], [ 50331648, [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 0, [[LOADBB]] ], [ 67108864, [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    br i1 true, label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    br i1 false, label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CALL_LOBIT:%.*]] = lshr i32 [[PHI_RES]], 31
+; CHECK-NEXT:    [[CALL_LOBIT_NOT:%.*]] = xor i32 [[CALL_LOBIT]], 1
+; CHECK-NEXT:    ret i32 [[CALL_LOBIT_NOT]]
+;
+  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest03.buffer1, ptr @zeroEqualityTest03.buffer2, i64 16)
+  %call.lobit = lshr i32 %call, 31
+  %call.lobit.not = xor i32 %call.lobit, 1
+  ret i32 %call.lobit.not
+}
+
+; Validate with memcmp()?:
+define signext i32 @equalityFoldTwoConstants() {
+; CHECK-LABEL: define signext i32 @equalityFoldTwoConstants(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    br i1 false, label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    br i1 false, label [[RES_BLOCK]], label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[NOT_TOBOOL:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[NOT_TOBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr @zeroEqualityTest04.buffer2, i64 16)
+  %not.tobool = icmp eq i32 %call, 0
+  %cond = zext i1 %not.tobool to i32
+  ret i32 %cond
+}
+
+define signext i32 @equalityFoldOneConstant(ptr %X) {
+; CHECK-LABEL: define signext i32 @equalityFoldOneConstant(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 4294967296, [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 12884901890, [[TMP4]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[NOT_TOBOOL:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[NOT_TOBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr %X, i64 16)
+  %not.tobool = icmp eq i32 %call, 0
+  %cond = zext i1 %not.tobool to i32
+  ret i32 %cond
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call signext i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call signext i32 @memcmp(ptr %X, ptr %Y, i64 2) nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
diff --git a/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp-mergeexpand.ll b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp-mergeexpand.ll
new file mode 100644
index 00000000000000..ffc49478cfa4d3
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp-mergeexpand.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s
+
+; This tests the interaction between MergeICmp and expand-memcmp.
+
+%"struct.std::pair" = type { i32, i32 }
+
+define zeroext i1 @opeq1(
+; CHECK-LABEL: define zeroext i1 @opeq1(
+; CHECK-SAME: ptr nocapture readonly dereferenceable(8) [[A:%.*]], ptr nocapture readonly dereferenceable(8) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_I]], label [[LAND_RHS_I:%.*]], label [[OPEQ1_EXIT:%.*]]
+; CHECK:       land.rhs.i:
+; CHECK-NEXT:    [[SECOND_I:%.*]] = getelementptr inbounds %"struct.std::pair", ptr [[A]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SECOND_I]], align 4
+; CHECK-NEXT:    [[SECOND2_I:%.*]] = getelementptr inbounds %"struct.std::pair", ptr [[B]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[SECOND2_I]], align 4
+; CHECK-NEXT:    [[CMP3_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    br label [[OPEQ1_EXIT]]
+; CHECK:       opeq1.exit:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP3_I]], [[LAND_RHS_I]] ]
+; CHECK-NEXT:    ret i1 [[TMP4]]
+;
+  ptr nocapture readonly dereferenceable(8) %a,
+  ptr nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
+entry:
+  %0 = load i32, ptr %a, align 4
+  %1 = load i32, ptr %b, align 4
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", ptr %a, i64 0, i32 1
+  %2 = load i32, ptr %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", ptr %b, i64 0, i32 1
+  %3 = load i32, ptr %second2.i, align 4
+  %cmp3.i = icmp eq i32 %2, %3
+  br label %opeq1.exit
+
+opeq1.exit:
+  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
+  ret i1 %4
+}
+
+
diff --git a/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp.ll
new file mode 100644
index 00000000000000..21cdbd65544c4c
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s
+
+define signext i32 @memcmp8(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
+; CHECK-LABEL: define signext i32 @memcmp8(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 8)
+  ret i32 %call
+}
+
+define signext i32 @memcmp4(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
+; CHECK-LABEL: define signext i32 @memcmp4(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
+  ret i32 %call
+}
+
+define signext i32 @memcmp2(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
+; CHECK-LABEL: define signext i32 @memcmp2(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i32 [[TMP7]]
+;
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 2)
+  ret i32 %call
+}
+
+define signext i32 @memcmp1(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
+; CHECK-LABEL: define signext i32 @memcmp1(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i32 [[TMP5]]
+;
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 1) #2
+  ret i32 %call
+}
+
+declare signext i32 @memcmp(ptr, ptr, i64)
diff --git a/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmpIR.ll b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmpIR.ll
new file mode 100644
index 00000000000000..3ad0c9d12ea0bc
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmpIR.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s
+
+define signext i32 @test1(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
+; CHECK-LABEL: define signext i32 @test1(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP11:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BUFFER1]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[BUFFER2]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11]] = call i64 @llvm.bswap.i64(i64 [[TMP9]])
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP1]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+entry:
+
+
+
+  ; CHECK-BE-LABEL: @test1(
+  ; CHECK-BE-LABEL: res_block:{{.*}}
+  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-BE-NEXT: br label %endblock
+
+  ; CHECK-BE-LABEL: loadbb:{{.*}}
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
+
+  ; CHECK-BE-LABEL: loadbb1:{{.*}}
+  ; CHECK-BE-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
+  ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
+  ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, ptr [[GEP1]]
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr [[GEP2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %endblock, label %res_block
+
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 16)
+  ret i32 %call
+}
+
+declare signext i32 @memcmp(ptr nocapture, ptr nocapture, i64) local_unnamed_addr #1
+
+define signext i32 @test2(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
+; CHECK-LABEL: define signext i32 @test2(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+
+  ; CHECK-BE-LABEL: @test2(
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
+  ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
+  ; CHECK-BE-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
+  ; CHECK-BE-NEXT: ret i32 [[SUB]]
+
+entry:
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
+  ret i32 %call
+}
+
+define signext i32 @test3(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
+; CHECK-LABEL: define signext i32 @test3(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1:%.*]] ], [ [[TMP22:%.*]], [[LOADBB2:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1]] ], [ [[TMP23:%.*]], [[LOADBB2]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BUFFER1]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[BUFFER2]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[BUFFER1]], i64 12
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[BUFFER2]], i64 12
+; CHECK-NEXT:    [[TMP18:%.*]] = load i16, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP19]])
+; CHECK-NEXT:    [[TMP22]] = zext i16 [[TMP20]] to i64
+; CHECK-NEXT:    [[TMP23]] = zext i16 [[TMP21]] to i64
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[TMP22]], [[TMP23]]
+; CHECK-NEXT:    br i1 [[TMP24]], label [[LOADBB3:%.*]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[BUFFER1]], i64 14
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[BUFFER2]], i64 14
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i8 [[TMP27]] to i32
+; CHECK-NEXT:    [[TMP30:%.*]] = zext i8 [[TMP28]] to i32
+; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    br label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP31]], [[LOADBB3]] ], [ [[TMP1]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+  ; CHECK-BE-LABEL: res_block:{{.*}}
+  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-BE-NEXT: br label %endblock
+
+  ; CHECK-BE-LABEL: loadbb:{{.*}}
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb2, label %res_block
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, ptr
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb3, label %res_block
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, ptr
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT:  br label %endblock
+
+entry:
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 15)
+  ret i32 %call
+}
+  ; CHECK-BE: call = tail call signext i32 @memcmp
+define signext i32 @test4(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
+; CHECK-LABEL: define signext i32 @test4(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call signext i32 @memcmp(ptr [[BUFFER1]], ptr [[BUFFER2]], i64 65)
+; CHECK-NEXT:    ret i32 [[CALL]]
+;
+entry:
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 65)
+  ret i32 %call
+}
+
+define signext i32 @test5(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2, i32 signext %SIZE)  {
+; CHECK-LABEL: define signext i32 @test5(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]], i32 signext [[SIZE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[SIZE]] to i64
+; CHECK-NEXT:    [[CALL:%.*]] = tail call signext i32 @memcmp(ptr [[BUFFER1]], ptr [[BUFFER2]], i64 [[CONV]])
+; CHECK-NEXT:    ret i32 [[CALL]]
+;
+  ; CHECK-BE: call = tail call signext i32 @memcmp
+entry:
+  %conv = sext i32 %SIZE to i64
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 %conv)
+  ret i32 %call
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll
index 41d357728b93e7..5877d00a818c5f 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll
@@ -1,16 +1,16 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64
 
 declare i32 @bcmp(ptr nocapture, ptr nocapture, i64)
 
 define i32 @bcmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @bcmp8(
-; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    ret i32 [[TMP6]]
+; X64-LABEL: define i32 @bcmp8(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i32 [[TMP4]]
 ;
   %call = tail call i32 @bcmp(ptr %x, ptr %y, i64 8)
   ret i32 %call
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-2.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-2.ll
new file mode 100644
index 00000000000000..4424488a7fffb1
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-2.ll
@@ -0,0 +1,20249 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4  -mtriple=x86_64-unknown-unknown               < %s | FileCheck %s --check-prefixes=X64
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=sse4.1 < %s | FileCheck %s --check-prefixes=X64-SSE41
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx    < %s | FileCheck %s --check-prefixes=X64-AVX1
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx2   < %s | FileCheck %s --check-prefixes=X64-AVX2
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512bw,+prefer-256-bit < %s | FileCheck %s --check-prefixes=X64-AVX512BW-256
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512bw,-prefer-256-bit < %s | FileCheck %s --check-prefixes=X64-AVX512BW
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-AVX512F-256
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-AVX512F
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-MIC-AVX2
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-MIC-AVX512F
+
+; This tests middle-end (ExpandMemCmp) inlining/optimization of memcmp
+; rdar://6480398
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length0(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; X64-NEXT:    ret i32 0
+;
+; X64-SSE41-LABEL: define i32 @length0(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-SSE41-NEXT:    ret i32 0
+;
+; X64-AVX1-LABEL: define i32 @length0(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX1-NEXT:    ret i32 0
+;
+; X64-AVX2-LABEL: define i32 @length0(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX2-NEXT:    ret i32 0
+;
+; X64-AVX512BW-256-LABEL: define i32 @length0(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512BW-256-NEXT:    ret i32 0
+;
+; X64-AVX512BW-LABEL: define i32 @length0(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512BW-NEXT:    ret i32 0
+;
+; X64-AVX512F-256-LABEL: define i32 @length0(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512F-256-NEXT:    ret i32 0
+;
+; X64-AVX512F-LABEL: define i32 @length0(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512F-NEXT:    ret i32 0
+;
+; X64-MIC-AVX2-LABEL: define i32 @length0(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-MIC-AVX2-NEXT:    ret i32 0
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length0(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-MIC-AVX512F-NEXT:    ret i32 0
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  ret i32 %m
+}
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length0_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
+; X64-SSE41-LABEL: define i1 @length0_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    ret i1 true
+;
+; X64-AVX1-LABEL: define i1 @length0_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    ret i1 true
+;
+; X64-AVX2-LABEL: define i1 @length0_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    ret i1 true
+;
+; X64-AVX512BW-256-LABEL: define i1 @length0_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    ret i1 true
+;
+; X64-AVX512BW-LABEL: define i1 @length0_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    ret i1 true
+;
+; X64-AVX512F-256-LABEL: define i1 @length0_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    ret i1 true
+;
+; X64-AVX512F-LABEL: define i1 @length0_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    ret i1 true
+;
+; X64-MIC-AVX2-LABEL: define i1 @length0_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    ret i1 true
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length0_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    ret i1 true
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length0_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
+; X64-SSE41-LABEL: define i1 @length0_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    ret i1 false
+;
+; X64-AVX1-LABEL: define i1 @length0_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    ret i1 false
+;
+; X64-AVX2-LABEL: define i1 @length0_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    ret i1 false
+;
+; X64-AVX512BW-256-LABEL: define i1 @length0_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    ret i1 false
+;
+; X64-AVX512BW-LABEL: define i1 @length0_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    ret i1 false
+;
+; X64-AVX512F-256-LABEL: define i1 @length0_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    ret i1 false
+;
+; X64-AVX512F-LABEL: define i1 @length0_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    ret i1 false
+;
+; X64-MIC-AVX2-LABEL: define i1 @length0_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    ret i1 false
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length0_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    ret i1 false
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i32 @length2(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i32 @length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i32 @length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length2(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512BW-LABEL: define i32 @length2(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length2(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512F-LABEL: define i32 @length2(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP7]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length2(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length2(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP7]]
+;
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i32 @length2_const(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length2_const(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-NEXT:    ret i32 [[TMP4]]
+;
+; X64-SSE41-LABEL: define i32 @length2_const(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-SSE41-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX1-LABEL: define i32 @length2_const(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX1-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX2-LABEL: define i32 @length2_const(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX2-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length2_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX512BW-LABEL: define i32 @length2_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512BW-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length2_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX512F-LABEL: define i32 @length2_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512F-NEXT:    ret i32 [[TMP4]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length2_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP4]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length2_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP4]]
+;
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_gt_const(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_gt_const(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_gt_const(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_gt_const(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_gt_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_gt_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_gt_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_gt_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_gt_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_gt_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length2_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length3(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length3(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length3(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length3(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length3(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length3(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length3(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length3(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length3(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length3(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length3_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length3_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length3_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length3_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length3_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length3_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length3_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length3_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length3_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length3_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-SSE41-LABEL: define i32 @length4(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length4(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length4(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length4(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-LABEL: define i32 @length4(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length4(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-LABEL: define i32 @length4(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length4(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length4(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-SSE41-LABEL: define i1 @length4_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    ret i1 [[TMP5]]
+;
+; X64-SSE41-LABEL: define i1 @length4_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX1-LABEL: define i1 @length4_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX2-LABEL: define i1 @length4_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    ret i1 [[TMP5]]
+;
+; X64-SSE41-LABEL: define i1 @length4_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX1-LABEL: define i1 @length4_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX2-LABEL: define i1 @length4_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length4_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length4_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length5(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length5(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length5(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length5(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length5(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length5(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length5(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length5(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length5(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length5(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length5_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length5_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length5_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length5_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length5_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length5_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length5_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length5_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length5_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length5_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length5_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length5_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length5_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length5_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length5_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length5_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length5_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length5_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length5_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length5_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length7(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length7(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length7(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length7(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length7(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length7(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length7(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length7(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length7(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length7(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length7(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  ret i32 %m
+}
+
+define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length7_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length7_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length7_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length7_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length7_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length7_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length7_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length7_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length7_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length7_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length7_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    ret i1 [[TMP10]]
+;
+; X64-SSE41-LABEL: define i1 @length7_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX1-LABEL: define i1 @length7_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length7_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length7_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length7_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length7_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length7_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length7_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length7_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; length8: memcmp(X, Y, 8) with the raw 3-way result returned directly.
+; Expected inline expansion (identical for every target prefix below): two
+; unaligned i64 loads, llvm.bswap on each so the unsigned compare matches
+; byte/memory order, then zext(ugt) - zext(ult) to form the signed result.
+; NOTE(review): the CHECK lines use the [[TMPn:%.*]] capture style of
+; utils/update_test_checks.py -- presumably autogenerated; regenerate with
+; the script rather than hand-editing if the expansion changes.
+define i32 @length8(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length8(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-SSE41-LABEL: define i32 @length8(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length8(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length8(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length8(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-LABEL: define i32 @length8(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length8(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-LABEL: define i32 @length8(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length8(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length8(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+; length8_eq: memcmp(X, Y, 8) == 0 (pure equality). Expected expansion for
+; every prefix: two unaligned i64 loads and a single icmp ne -- no bswap is
+; emitted because equality does not depend on byte order.
+; NOTE(review): CHECK lines use the [[TMPn:%.*]] capture style of
+; utils/update_test_checks.py -- presumably autogenerated; regenerate with
+; the script rather than hand-editing if the expansion changes.
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length8_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length8_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length8_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length8_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length8_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length8_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length8_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length8_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; length8_eq_const: memcmp(X, @.str, 8) != 0. Expected expansion folds the
+; constant operand into a single i64 immediate (3978425819141910832 --
+; presumably the little-endian bytes of @.str; confirm against the global's
+; initializer, which is defined elsewhere in this file) compared against one
+; unaligned i64 load of X.
+; NOTE(review): [[TMP3]] (the zext) is dead in every prefix and the checks
+; use the [[TMPn:%.*]] capture style -- presumably autogenerated by
+; utils/update_test_checks.py; regenerate rather than hand-edit.
+define i1 @length8_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length8_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-SSE41-LABEL: define i1 @length8_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length8_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length8_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length8_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length8_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length8_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; length9_eq: memcmp(X, Y, 9) == 0. Expected expansion for every prefix: an
+; i64 load pair covering bytes 0-7, an i8 load pair for byte 8 (zext'd to
+; i64), xor each pair, or the two differences together, and test the result
+; against zero -- equality only, so no bswap is needed.
+; NOTE(review): CHECK lines use the [[TMPn:%.*]] capture style of
+; utils/update_test_checks.py -- presumably autogenerated; regenerate with
+; the script rather than hand-editing if the expansion changes.
+define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length9_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length9_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length9_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length9_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length9_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length9_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length9_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length9_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length9_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length9_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length10_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length10_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length10_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length10_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length10_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length10_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length10_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length10_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length10_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length10_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length11_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length11_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length11_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length11_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length11_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length11_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length11_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length11_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length11_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length11_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length12_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length12_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length12_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length12_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length12_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length12_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length12_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length12_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length12_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length12_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length12(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length12(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-SSE41-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-SSE41-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length12(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX1-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX1-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length12(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length12(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length12(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512BW-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length12(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length12(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512F-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512F-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length12(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length12(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length13_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length13_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length13_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length13_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length13_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length13_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length13_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length13_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length13_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length13_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length14_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length14_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length14_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length14_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length14_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length14_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length14_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length14_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length14_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length14_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length15(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length15(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length15(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length15(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length15(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length15(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length15(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length15(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length15(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length15(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length15(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  ret i32 %m
+}
+
+define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length15_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length15_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length15_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length15_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length15_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length15_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length15_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length15_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length15_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length15_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length15_const(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length15_const(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length15_const(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-SSE41-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-SSE41-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length15_const(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX1-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX1-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length15_const(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX2-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX2-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length15_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length15_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512BW-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512BW-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length15_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512F-256-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512F-256-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length15_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512F-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512F-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512F-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length15_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length15_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
+  ret i32 %m
+}
+
+define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length15_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length15_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length15_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length15_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length15_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length15_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length15_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length15_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length15_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length15_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length15_gt_const(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length15_gt_const(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-SSE41-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-SSE41-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length15_gt_const(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX1-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX1-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length15_gt_const(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX2-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX2-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length15_gt_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length15_gt_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512BW-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512BW-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length15_gt_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512F-256-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512F-256-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length15_gt_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512F-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512F-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512F-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length15_gt_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length15_gt_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length16(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length16(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length16(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length16(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length16(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length16(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length16(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length16(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length16(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length16(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length16_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-SSE41-LABEL: define i1 @length16_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length16_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length16_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length16_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length16_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length16_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length16_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length16_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length16_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length16_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length16_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length16_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-AVX-LABEL: length16_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length16_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length24(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length24(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length24(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length24(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length24(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length24(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length24(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length24(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length24(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length24(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length24_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length24_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length24_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; length24_lt: memcmp(x, y, 24) compared "slt 0" (i.e. x sorts before y).
+; Every prefix expects the same target-independent expansion: three chained
+; 8-byte unaligned loads per operand, each byte-swapped via llvm.bswap.i64 so
+; an unsigned i64 compare yields lexicographic (big-endian byte) order, with
+; an early exit to res_block on the first mismatching word.
+define i1 @length24_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length24_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; NOTE(review): the run of blank lines below looks like residue from CHECK
+; lines dropped during regeneration; rerunning update_test_checks.py on this
+; test should clean them up. They are harmless to FileCheck but noisy.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+; Input to the pass: the raw libc memcmp call that the CHECK lines above
+; expect to be expanded.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length24_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length24_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-NEXT:    ret i1 [[TMP8]]
+;
+; X64-SSE41-LABEL: define i1 @length24_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX-LABEL: length24_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length24_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length31(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length31(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length31(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length31(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length31(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length31(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length31(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length31(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length31(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length31(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length31(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
+  ret i32 %m
+}
+
+define i1 @length31_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length31_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; NOTE(review): the X64-AVX / X64-MIC-AVX lines below look like leftover *assembly*
+; CHECK lines carried over from the original llc test. An opt/IR-level test has no
+; RUN line producing these prefixes, so FileCheck would never match them -- confirm
+; against the RUN lines at the top of the file and drop them if unused.
+; X64-AVX-LABEL: length31_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length31_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length31_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length31_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+;
+; X64-LABEL: define i1 @length31_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length31_eq_prefer128:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length31_eq_prefer128:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length31_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP7]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP7]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX-LABEL: length31_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length31_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [943142453,842084409,909456435,809056311]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length32(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length32(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length32(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length32(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length32(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length32(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length32(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length32(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length32(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length32(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+; Check that a 32-byte memcmp-for-equality expands to wide (i128/i256) loads,
+; xor/or reduction and a single compare, with no byte-wise loop.
+define i1 @length32_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length32_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length32_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length32_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+;
+; X64-LABEL: define i1 @length32_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length32_eq_prefer128:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length32_eq_prefer128:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length32_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512-LABEL: length32_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length32_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length48(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length48(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length48(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length48(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length48(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length48(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length48(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length48(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length48(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length48(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length48(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
+  ret i32 %m
+}
+
+define i1 @length48_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length48_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $48, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length48_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512-LABEL: length48_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-AVX512-NEXT:    vmovdqu 32(%rsi), %xmm2
+; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length48_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length48_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length48_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X64-LABEL: define i1 @length48_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length48_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $48, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length48_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X64-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X64-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-NEXT:    ret i1 [[TMP11]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP11]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512-LABEL: length48_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length48_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length63(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length63(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length63(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length63(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length63(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length63(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length63(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length63(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length63(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length63(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length63(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
+  ret i32 %m
+}
+
+define i1 @length63_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length63_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $63, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length63_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X64-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-NEXT:    ret i1 [[TMP22]]
+;
+; X64-SSE41-LABEL: define i1 @length63_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX1-LABEL: define i1 @length63_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length63_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512-LABEL: length63_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-AVX512-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length63_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm2
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rsi), %ymm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length63_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length63_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length63_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length63_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length63_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length63_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length63_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length63_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length63_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $63, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length63_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X64-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length63_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length63_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length63_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512-LABEL: length63_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length63_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [875770417,943142453,842084409,909456435,809056311,875770417,943142453,842084409]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length64(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length64(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length64(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length64(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length64(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length64(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length64(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length64(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length64(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length64(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length64_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $64, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length64_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X64-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-NEXT:    ret i1 [[TMP22]]
+;
+; X64-SSE41-LABEL: define i1 @length64_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512-LABEL: length64_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-AVX512-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
+; X64-AVX512-NEXT:    kortestw %k0, %k0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length64_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length64_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length64_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length64_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length64_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length64_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length64_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $64, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length64_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X64-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length64_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512-LABEL: length64_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-AVX512-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
+; X64-AVX512-NEXT:    kortestw %k0, %k0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length96(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length96(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length96(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length96(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length96(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length96(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length96(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length96(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length96(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length96(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length96(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
+  ret i32 %m
+}
+
+define i1 @length96_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length96_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $96, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length96_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX2-LABEL: define i1 @length96_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP16]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length96_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length96_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length96_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length96_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length96_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $96, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length96_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 96) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length96_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length96_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length96_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length127(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length127(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length127(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length127(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length127(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length127(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length127(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length127(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length127(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length127(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length127(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
+  ret i32 %m
+}
+
+define i1 @length127_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length127_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $127, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length127_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX1-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX2-LABEL: define i1 @length127_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512F-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length127_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length127_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length127_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length127_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length127_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $127, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length127_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 127) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length127_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length127_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length127_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length128(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
+  ret i32 %m
+}
+
+define i1 @length128_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length128_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $128, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length128_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX1-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX2-LABEL: define i1 @length128_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512F-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length128_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length128_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length128_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length128_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length128_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $128, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length128_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 128) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length128_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length128_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length128_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length192(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length192(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length192(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length192(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length192(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length192(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length192(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length192(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length192(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length192(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length192(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
+  ret i32 %m
+}
+
+define i1 @length192_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length192_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP16]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP16]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length192_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length192_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+; Input IR: memcmp(x, y, 192) tested for > 0. Per the autogenerated checks
+; above, every target configuration keeps the libcall (a three-way compare of
+; this size is not expanded inline by any of the tested subtargets).
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length192_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length192_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length192_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length192_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+; Input IR: equality test of x against the constant @.str over 192 bytes.
+; Per the checks above, the 512-bit-capable configurations (AVX512BW,
+; AVX512F, MIC-AVX512F) expand this into three i512 load/xor pairs (at
+; offsets 0, 64, 128) OR-reduced to a single zero test; the remaining
+; configurations keep the libcall.
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length255(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length255(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length255(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length255(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length255(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length255(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length255(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length255(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length255(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length255(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length255(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+; Input IR: the raw three-way result of memcmp(X, Y, 255) is returned.
+; Per the checks above, no configuration expands the three-way form at
+; this size; the libcall is kept everywhere.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
+  ret i32 %m
+}
+
+define i1 @length255_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length255_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+
+
+
+
+
+; Input IR: inequality test of x vs y over 255 bytes. The 512-bit-capable
+; configurations expand to four i512 load/xor pairs where the last pair is
+; loaded at offset 191 so it overlaps the previous chunk (191 + 64 = 255,
+; avoiding a smaller tail compare); other configurations keep the libcall.
+; NOTE(review): the expanded checks record a dead zext ([[TMP23]] is never
+; used; the i1 [[TMP22]] is returned directly) — an artifact of the
+; expansion captured verbatim by the autogenerated checks.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length255_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+; Input IR: memcmp(x, y, 255) tested for < 0. Per the checks above, the
+; three-way ordering form at this size is not expanded inline by any
+; configuration; the libcall is kept everywhere.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length255_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+; Input IR: memcmp(x, y, 255) tested for > 0. Mirrors @length255_lt; per
+; the checks above, the libcall is kept by every configuration.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length255_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length255_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length255_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length255_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+; Input IR: equality test of X against the constant @.str over 255 bytes.
+; The 512-bit-capable configurations expand to four i512 load/xor pairs,
+; with the final pair loaded at offset 191 so it overlaps the previous
+; chunk (191 + 64 = 255); other configurations keep the libcall.
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length256(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length256(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length256(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length256(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length256(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length256(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length256(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length256(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length256(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length256(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length256(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
+  ret i32 %m
+}
+
+define i1 @length256_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length256_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length256_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length256_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length256_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length256_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length256_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length256_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length384(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length384(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length384(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length384(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length384(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length384(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length384(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length384(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length384(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length384(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length384(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
+  ret i32 %m
+}
+
+define i1 @length384_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length384_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length384_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length384_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length384_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length511(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length511(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length511(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length511(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length511(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length511(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length511(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length511(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length511(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length511(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length511(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
+  ret i32 %m
+}
+
+define i1 @length511_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length511_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length511_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length511_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length511_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length512(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length512(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length512(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length512(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length512(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length512(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length512(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length512(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length512(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length512(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length512(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
+  ret i32 %m
+}
+
+define i1 @length512_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length512_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length512_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length512_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length512_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks that we do not do stupid things with huge sizes.
+define i32 @huge_length(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @huge_length(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @huge_length(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @huge_length(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @huge_length(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @huge_length(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @huge_length(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @huge_length(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @huge_length(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @huge_length(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @huge_length(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  ret i32 %m
+}
+
+define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @huge_length_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @huge_length_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @huge_length_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @huge_length_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @huge_length_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @huge_length_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @huge_length_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @huge_length_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @huge_length_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @huge_length_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks non-constant sizes.
+define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
+; X64-LABEL: define i32 @nonconst_length(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @nonconst_length(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @nonconst_length(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @nonconst_length(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @nonconst_length(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @nonconst_length(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @nonconst_length(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @nonconst_length(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @nonconst_length(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @nonconst_length(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
+; X64-LABEL: define i1 @nonconst_length_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @nonconst_length_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @nonconst_length_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @nonconst_length_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
diff --git a/llvm/test/CodeGen/X86/memcmp-constant.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-constant.ll
similarity index 50%
rename from llvm/test/CodeGen/X86/memcmp-constant.ll
rename to llvm/test/Transforms/ExpandMemCmp/X86/memcmp-constant.ll
index 2059b8f8040827..908c6b34183e57 100644
--- a/llvm/test/CodeGen/X86/memcmp-constant.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-constant.ll
@@ -1,5 +1,7 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown               | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD
+; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_2LD
+
 
 @.str1 = private constant [4 x i8] c"\00\00\00\00", align 1
 @.str2 = private constant [4 x i8] c"\ff\ff\ff\ff", align 1
@@ -7,49 +9,49 @@
 declare i32 @memcmp(ptr, ptr, i64)
 
 define i32 @length4_same() nounwind {
-; CHECK-LABEL: length4_same:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i32 @length4_same(
+; X64-SAME: ) #[[ATTR0:[0-9]+]] {
+; X64-NEXT:    ret i32 0
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str1, i64 4) nounwind
   ret i32 %m
 }
 
 define i1 @length4_same_lt() nounwind {
-; CHECK-LABEL: length4_same_lt:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_same_lt(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str1, i64 4) nounwind
   %c = icmp slt i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_same_gt() nounwind {
-; CHECK-LABEL: length4_same_gt:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_same_gt(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str1, i64 4) nounwind
   %c = icmp sgt i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_same_le() nounwind {
-; CHECK-LABEL: length4_same_le:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_same_le(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str1, i64 4) nounwind
   %c = icmp sle i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_same_ge() nounwind {
-; CHECK-LABEL: length4_same_ge:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_same_ge(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str1, i64 4) nounwind
   %c = icmp sge i32 %m, 0
   ret i1 %c
@@ -57,52 +59,55 @@ define i1 @length4_same_ge() nounwind {
 
 
 define i32 @length4() nounwind {
-; CHECK-LABEL: length4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl $-1, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i32 -1
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str2, i64 4) nounwind
   ret i32 %m
 }
 
 define i1 @length4_lt() nounwind {
-; CHECK-LABEL: length4_lt:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_lt(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str2, i64 4) nounwind
   %c = icmp slt i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_gt() nounwind {
-; CHECK-LABEL: length4_gt:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_gt(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str2, i64 4) nounwind
   %c = icmp sgt i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_le() nounwind {
-; CHECK-LABEL: length4_le:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_le(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str2, i64 4) nounwind
   %c = icmp sle i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_ge() nounwind {
-; CHECK-LABEL: length4_ge:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_ge(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str2, i64 4) nounwind
   %c = icmp sge i32 %m, 0
   ret i1 %c
 }
 
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64_1LD: {{.*}}
+; X64_2LD: {{.*}}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize-x32.ll
new file mode 100644
index 00000000000000..edd70ddb445dcc
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize-x32.ll
@@ -0,0 +1,493 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=cmov < %s | FileCheck %s --check-prefix=X86
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s  --check-prefix=X86-SSE2
+
+; This tests codegen time inlining/optimization of memcmp
+; rdar://6480398
+
+ at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i32)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR2:[0-9]+]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR2:[0-9]+]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length2_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length2_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length3(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 3) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length3(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 3) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length3_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 3) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length3_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 3) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length4(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 4) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length4(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 4) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length4_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 4) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 4) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length4_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length5(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 5) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length5(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 5) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length5_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 5) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length5_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 5) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length8(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 8) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length8(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 8) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length8_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 8) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 8) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length8_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 8) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 8) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length12_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length12_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length12(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length12(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length16(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length16(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind minsize {
+;
+; X86-LABEL: define i1 @length16_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR2]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind minsize {
+;
+; X86-LABEL: define i1 @length16_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length24(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length24(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind minsize {
+; X86-LABEL: define i1 @length24_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR2]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length24_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length32(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length32(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind minsize {
+; X86-LABEL: define i1 @length32_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR2]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length32_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length64(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length64(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind minsize {
+; X86-LABEL: define i1 @length64_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR2]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length64_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize.ll
new file mode 100644
index 00000000000000..431dc158962996
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize.ll
@@ -0,0 +1,707 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=X64
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s --check-prefix=X64-AVX1
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s --check-prefix=X64-AVX2
+
+; This tests codegen time inlining/optimization of memcmp
+; rdar://6480398
+
+ at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR1:[0-9]+]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2:[0-9]+]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2:[0-9]+]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length2_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind minsize {
+; X64-LABEL: define i1 @length2_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2:[0-9]+]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length3(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length3(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length3(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length3_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length3_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length3_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length4(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length4(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length4_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind minsize {
+; X64-LABEL: define i1 @length4_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length5(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length5(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length5(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length5_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length5_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length5_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length8(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length8(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length8(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length8_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind minsize {
+; X64-LABEL: define i1 @length8_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 8) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 8) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 8) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length12_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length12_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length12_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length12(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length12(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length12(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind minsize {
+;
+; X64-LABEL: define i32 @length16(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length16(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length16(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind minsize {
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm0
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length16_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR1]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind minsize {
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length16_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length16_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 16) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 16) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 16) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length24(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length24(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length24(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind minsize {
+; X64-LABEL: define i1 @length24_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR1]]
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind minsize {
+; X64-LABEL: define i1 @length24_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 24) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 24) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 24) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length32(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length32(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length32(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind minsize {
+; X64-SSE2-LABEL: length32_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    pushq $32
+; X64-SSE2-NEXT:    popq %rdx
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length32_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR1]]
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind minsize {
+; X64-SSE2-LABEL: length32_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    pushq $32
+; X64-SSE2-NEXT:    popq %rdx
+; X64-SSE2-NEXT:    movl $.L.str, %esi
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length32_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 32) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 32) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 32) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length64(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length64(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length64(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind minsize {
+; X64-LABEL: define i1 @length64_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR1]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind minsize {
+; X64-LABEL: define i1 @length64_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 64) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 64) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 64) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs-x32.ll
new file mode 100644
index 00000000000000..abdadb14086c20
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs-x32.ll
@@ -0,0 +1,6203 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way.
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=i686-unknown-unknown -mattr=cmov     < %s | FileCheck %s --check-prefixes=X86
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=i686-unknown-unknown -mattr=+sse     < %s | FileCheck %s --check-prefixes=X86-SSE1
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=i686-unknown-unknown -mattr=+sse2    < %s | FileCheck %s --check-prefixes=X86-SSE2
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=i686-unknown-unknown -mattr=+sse4.1  < %s | FileCheck %s --check-prefixes=X86-SSE41
+
+; This tests middle-end (expand-memcmp pass) inlining/optimization of memcmp
+; rdar://6480398
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i32)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length0(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-NEXT:    ret i32 0
+;
+; X86-SSE1-LABEL: define i32 @length0(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE1-NEXT:    ret i32 0
+;
+; X86-SSE2-LABEL: define i32 @length0(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE2-NEXT:    ret i32 0
+;
+; X86-SSE41-LABEL: define i32 @length0(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE41-NEXT:    ret i32 0
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  ret i32 %m
+}
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length0_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    ret i1 true
+;
+; X86-SSE1-LABEL: define i1 @length0_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    ret i1 true
+;
+; X86-SSE2-LABEL: define i1 @length0_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    ret i1 true
+;
+; X86-SSE41-LABEL: define i1 @length0_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    ret i1 true
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length0_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    ret i1 false
+;
+; X86-SSE1-LABEL: define i1 @length0_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    ret i1 false
+;
+; X86-SSE2-LABEL: define i1 @length0_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    ret i1 false
+;
+; X86-SSE41-LABEL: define i1 @length0_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    ret i1 false
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE1-LABEL: define i32 @length2(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i32 @length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i32 @length2(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length2_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length3(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length3(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length3(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length3(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length3_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE1-LABEL: define i1 @length3_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length3_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE41-LABEL: define i1 @length3_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length4(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE1-LABEL: define i32 @length4(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE1-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE2-LABEL: define i32 @length4(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE2-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE41-LABEL: define i32 @length4(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length4_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE1-LABEL: define i1 @length4_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE41-LABEL: define i1 @length4_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length4_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE1-LABEL: define i1 @length4_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE2-LABEL: define i1 @length4_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE41-LABEL: define i1 @length4_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length4_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE1-LABEL: define i1 @length4_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE2-LABEL: define i1 @length4_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE41-LABEL: define i1 @length4_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length4_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length4_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length4_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length5(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length5(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length5(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length5(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length5_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE1-LABEL: define i1 @length5_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length5_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE41-LABEL: define i1 @length5_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length5_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length5_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length5_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length5_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length7(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length7(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length7(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length7(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length7(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  ret i32 %m
+}
+
+define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length7_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE1-LABEL: define i1 @length7_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE2-LABEL: define i1 @length7_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE41-LABEL: define i1 @length7_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP10]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length7_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length7_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length7_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length7_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length8(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length8(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length8(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length8(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  ret i32 %m
+}
+
+; Equality-only memcmp(X, Y, 8) expansion for 32-bit x86: two i32 loads per
+; pointer (offsets 0 and 4), pairwise xor, an or-reduction, and one compare
+; against zero. No bswap is emitted -- byte order does not affect equality.
+; The identical straight-line expansion is expected at every SSE level.
+; NOTE(review): CHECK lines appear autogenerated (update_test_checks.py
+; style); regenerate with the script rather than editing by hand.
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length8_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length8_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length8_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp(X, @.str, 8) != 0 with a constant second operand: the constant is
+; folded into the immediates 858927408 (0x33323130) and 926299444
+; (0x37363534), so only X is loaded (two i32 loads, xor vs. immediate,
+; or-reduce, compare with zero). Presumably @.str begins "01234567" --
+; confirm against the global's definition earlier in the file.
+; NOTE(review): the expected IR keeps a dead `zext` ([[TMP8]]) after the
+; final icmp (the ret uses the i1 directly); presumably a later cleanup
+; pass removes it -- confirm this is intended in the expansion output.
+define i1 @length8_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length8_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE1-LABEL: define i1 @length8_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length8_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp(X, Y, 9) == 0: a non-power-of-two size. Expansion uses two i32
+; loads (offsets 0 and 4) plus a trailing i8 load at offset 8, zext'd to
+; i32 so all three differences can be or-reduced in one width before the
+; single zero test. Same expansion expected at every SSE level.
+define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length9_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
+; X86-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length9_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length9_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length9_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp(X, Y, 10) == 0: like length9_eq but the 2-byte tail is covered by
+; an i16 load at offset 8, zext'd to i32 for the common-width or-reduction.
+define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length10_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = zext i16 [[TMP12]] to i32
+; X86-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length10_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = zext i16 [[TMP12]] to i32
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length10_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = zext i16 [[TMP12]] to i32
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length10_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = zext i16 [[TMP12]] to i32
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 10) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp(X, Y, 11) == 0: instead of an i16+i8 tail, the expansion uses an
+; OVERLAPPING i32 load at offset 7 (covering bytes 7..10, re-reading byte 7)
+; so only three same-width loads per pointer are needed. Correct for
+; equality because re-compared bytes cannot introduce a false difference.
+define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length11_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length11_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length11_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length11_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 11) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp(X, Y, 12) != 0: three non-overlapping i32 loads per pointer
+; (offsets 0, 4, 8), xor'd and or-reduced to a single nonzero test. The
+; "ne" form lets the final icmp feed the ret directly.
+; NOTE(review): the expected IR keeps a dead `zext` ([[TMP17]]) after the
+; final icmp (the ret uses the i1 directly); presumably a later cleanup
+; pass removes it -- confirm this is intended in the expansion output.
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length12_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-NEXT:    ret i1 [[TMP16]]
+;
+; X86-SSE1-LABEL: define i1 @length12_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP16]]
+;
+; X86-SSE2-LABEL: define i1 @length12_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP16]]
+;
+; X86-SSE41-LABEL: define i1 @length12_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP16]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length12(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86:       loadbb2:
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length12(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb2:
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE1-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE1-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length12(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb2:
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE2-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE2-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length12(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb2:
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE41-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE41-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  ret i32 %m
+}
+
+define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length13_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP14]], align 1
+; X86-NEXT:    [[TMP17:%.*]] = load i8, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = zext i8 [[TMP16]] to i32
+; X86-NEXT:    [[TMP19:%.*]] = zext i8 [[TMP17]] to i32
+; X86-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length13_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP14]], align 1
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i8, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = zext i8 [[TMP16]] to i32
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = zext i8 [[TMP17]] to i32
+; X86-SSE1-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE1-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE1-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length13_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP14]], align 1
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i8, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = zext i8 [[TMP16]] to i32
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = zext i8 [[TMP17]] to i32
+; X86-SSE2-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE2-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE2-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length13_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP14]], align 1
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i8, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = zext i8 [[TMP16]] to i32
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = zext i8 [[TMP17]] to i32
+; X86-SSE41-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE41-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE41-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 13) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length14_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP16:%.*]] = load i16, ptr [[TMP14]], align 1
+; X86-NEXT:    [[TMP17:%.*]] = load i16, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP16]] to i32
+; X86-NEXT:    [[TMP19:%.*]] = zext i16 [[TMP17]] to i32
+; X86-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length14_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = load i16, ptr [[TMP14]], align 1
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i16, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP16]] to i32
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = zext i16 [[TMP17]] to i32
+; X86-SSE1-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE1-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE1-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length14_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = load i16, ptr [[TMP14]], align 1
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i16, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP16]] to i32
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = zext i16 [[TMP17]] to i32
+; X86-SSE2-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE2-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE2-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length14_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = load i16, ptr [[TMP14]], align 1
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i16, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP16]] to i32
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = zext i16 [[TMP17]] to i32
+; X86-SSE41-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE41-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE41-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 14) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length15_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 11
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 11
+; X86-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP23]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length15_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 11
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 11
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP23]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length15_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 11
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 11
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP23]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length15_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 11
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 11
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP23]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length16(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86:       loadbb2:
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86:       loadbb3:
+; X86-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length16(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb2:
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE1-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE1-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb3:
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE1-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE1-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE1-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length16(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb2:
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE2-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE2-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb3:
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE2-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE2-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE2-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE2-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE2-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length16(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb2:
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE41-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE41-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb3:
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE41-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE41-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE41-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE41-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  ; memcmp(X, Y, 16) with a full 3-way result: all four 32-bit-target configs
+  ; expand to four chained 4-byte loads, byte-swapped (bswap) so an unsigned
+  ; i32 compare in res_block yields the big-endian lexicographic -1/0/1 result.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind {
+; NOTE(review): removed a stale X86-NOSSE *assembly* check block left over
+; from the llc version of this test; this file checks IR output, and asm
+; checks (pushl/movl/retl) can never match it.
+; X86-LABEL: define i1 @length16_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-NEXT:    ret i1 [[TMP22]]
+;
+; X86-SSE1-LABEL: define i1 @length16_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP22]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE41-LABEL: define i1 @length16_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+  ; Equality-only memcmp(x, y, 16): no bswap needed. Scalar configs reduce
+  ; four 4-byte xors with or; SSE2+ configs use a single i128 load compare.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length16_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86:       loadbb2:
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86:       loadbb3:
+; X86-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length16_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb2:
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE1-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE1-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb3:
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE1-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE1-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE1-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb2:
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE2-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE2-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb3:
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE2-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE2-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE2-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE2-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE2-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length16_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb2:
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE41-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE41-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb3:
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE41-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE41-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE41-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE41-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  ; memcmp(x, y, 16) < 0: same four-word bswap'd expansion as @length16,
+  ; with the 3-way result reduced to its sign bit (icmp slt ... 0).
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length16_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86:       loadbb2:
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86:       loadbb3:
+; X86-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length16_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb2:
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE1-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE1-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb3:
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE1-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE1-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE1-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb2:
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE2-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE2-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb3:
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE2-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE2-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE2-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE2-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE2-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length16_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb2:
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE41-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE41-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb3:
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE41-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE41-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE41-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE41-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length16_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %esi
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl $858927408, %ecx # imm = 0x33323130
+; X86-NOSSE-NEXT:    xorl (%eax), %ecx
+; X86-NOSSE-NEXT:    movl $926299444, %edx # imm = 0x37363534
+; X86-NOSSE-NEXT:    xorl 4(%eax), %edx
+; X86-NOSSE-NEXT:    orl %ecx, %edx
+; X86-NOSSE-NEXT:    movl $825243960, %ecx # imm = 0x31303938
+; X86-NOSSE-NEXT:    xorl 8(%eax), %ecx
+; X86-NOSSE-NEXT:    movl $892613426, %esi # imm = 0x35343332
+; X86-NOSSE-NEXT:    xorl 12(%eax), %esi
+; X86-NOSSE-NEXT:    orl %ecx, %esi
+; X86-NOSSE-NEXT:    orl %edx, %esi
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    popl %esi
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP7]], 825243960
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = xor i32 [[TMP10]], 892613426
+; X86-NEXT:    [[TMP12:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP13:%.*]] = or i32 [[TMP8]], [[TMP11]]
+; X86-NEXT:    [[TMP14:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    [[TMP15:%.*]] = icmp ne i32 [[TMP14]], 0
+; X86-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length16_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP7]], 825243960
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = xor i32 [[TMP10]], 892613426
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = or i32 [[TMP8]], [[TMP11]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = icmp ne i32 [[TMP14]], 0
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length16_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length24(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5:[0-9]+]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length24(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5:[0-9]+]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length24(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5:[0-9]+]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length24(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5:[0-9]+]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length24_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length24_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length24_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length24_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length24_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length24_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length31(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length31(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length31(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length31(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length31(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 31) nounwind
+  ret i32 %m
+}
+
+define i1 @length31_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length31_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length31_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length31_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length31_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-NOSSE-LABEL: length31_eq_prefer128:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length31_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length31_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length31_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 31) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 31) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length32(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length32(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length32(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length32(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length32_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length32_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-NOSSE-LABEL: length32_eq_prefer128:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length48(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length48(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length48(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length48(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length48(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 48) nounwind
+  ret i32 %m
+}
+
+define i1 @length48_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length48_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $48
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length48_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length48_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length48_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-NOSSE-LABEL: length48_eq_prefer128:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $48
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length48_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length48_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $48
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length48_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP11]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 48) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length63(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length63(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length63(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length63(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length63(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 63) nounwind
+  ret i32 %m
+}
+
+; length63_eq: equality-only 63-byte compare. X86/SSE1 keep the libcall;
+; SSE2/SSE41 expand to four i128 load/xor pairs at offsets 0/16/32/47
+; (the last load overlaps to cover the odd tail) or-reduced to one icmp.
+; The trailing zext in the expansion is dead since the i1 is returned directly.
+define i1 @length63_eq(ptr %x, ptr %y) nounwind {
+; NOTE(review): the X86-NOSSE lines below are llc-style assembly checks that
+; look left over from the original codegen test; presumably not exercised by
+; this IR-level test's RUN lines — verify and drop if so.
+; X86-NOSSE-LABEL: length63_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $63
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length63_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP22]]
+;
+; X86-SSE41-LABEL: define i1 @length63_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; length63_lt: ordering (slt) use of a 63-byte memcmp; the full result is
+; needed, so all configs keep the libcall (no equality-only expansion).
+define i1 @length63_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length63_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length63_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; length63_gt: ordering (sgt) use of a 63-byte memcmp; libcall retained in
+; every config, mirroring length63_lt.
+define i1 @length63_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length63_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length63_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; length63_eq_const: 63-byte equality compare against the constant @.str.
+; SSE2/SSE41 expand to four i128 loads (offsets 0/16/32/47, last one
+; overlapping) xor'ed against i128 immediates folded from @.str.
+define i1 @length63_eq_const(ptr %X) nounwind {
+; NOTE(review): the X86-NOSSE lines below are llc-style assembly checks,
+; apparently left over from the ported codegen test — verify the RUN lines
+; still use this prefix, otherwise remove.
+; X86-NOSSE-LABEL: length63_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $63
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length63_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length63_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length63_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length63_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 63) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; length64: 64-byte memcmp with the full i32 ordering result used; all four
+; configs keep the libcall.
+define i32 @length64(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length64(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length64(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length64(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length64(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
+  ret i32 %m
+}
+
+; length64_eq: equality-only 64-byte compare. X86/SSE1 keep the libcall;
+; SSE2/SSE41 expand to four non-overlapping i128 load/xor pairs at offsets
+; 0/16/32/48, or-reduced to a single icmp. The trailing zext is dead.
+define i1 @length64_eq(ptr %x, ptr %y) nounwind {
+; NOTE(review): the X86-NOSSE lines below are llc-style assembly checks that
+; look left over from the original codegen test; presumably not exercised by
+; this IR-level test's RUN lines — verify and drop if so.
+; X86-NOSSE-LABEL: length64_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $64
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length64_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP22]]
+;
+; X86-SSE41-LABEL: define i1 @length64_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; length64_lt: ordering (slt) use of a 64-byte memcmp; libcall retained in
+; every config since the full result is required.
+define i1 @length64_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length64_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length64_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; length64_gt: ordering (sgt) use of a 64-byte memcmp; libcall retained in
+; every config, mirroring length64_lt.
+define i1 @length64_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length64_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length64_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; length64_eq_const: 64-byte equality compare against the constant @.str.
+; SSE2/SSE41 expand to four non-overlapping i128 loads (offsets 0/16/32/48)
+; xor'ed against i128 immediates folded from @.str.
+define i1 @length64_eq_const(ptr %X) nounwind {
+; NOTE(review): the X86-NOSSE lines below are llc-style assembly checks,
+; apparently left over from the ported codegen test — verify the RUN lines
+; still use this prefix, otherwise remove.
+; X86-NOSSE-LABEL: length64_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $64
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length64_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length64_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length64_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; length96: 96 bytes is beyond the expansion threshold for these configs;
+; the libcall is kept everywhere, even for the ordering result.
+define i32 @length96(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length96(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length96(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length96(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length96(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 96) nounwind
+  ret i32 %m
+}
+
+; length96_eq: even the equality-only form is not expanded at 96 bytes —
+; all four configs keep the libcall (unlike the 63/64-byte cases above).
+define i1 @length96_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; length96_lt: ordering (slt) use of a 96-byte memcmp; libcall kept in all
+; configs.
+define i1 @length96_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; length96_gt: ordering (sgt) use of a 96-byte memcmp; libcall kept in all
+; configs, mirroring length96_lt.
+define i1 @length96_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; length96_eq_const: constant-operand equality compare at 96 bytes; still a
+; libcall in all configs (no expansion even with one side constant).
+define i1 @length96_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length96_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length96_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length96_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length96_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 96) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; length127: 127-byte memcmp with the ordering result used; libcall kept in
+; all configs.
+define i32 @length127(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length127(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length127(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length127(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length127(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 127) nounwind
+  ret i32 %m
+}
+
+; length127_eq: equality-only 127-byte compare; above the expansion limit,
+; so the libcall is kept in all configs.
+define i1 @length127_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; length127_lt: ordering (slt) use of a 127-byte memcmp; libcall kept in all
+; configs.
+define i1 @length127_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length127_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length127_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length127_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length127_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 127) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length128(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 128) nounwind
+  ret i32 %m
+}
+
+define i1 @length128_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length128_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length128_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length128_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length128_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 128) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length192(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length192(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length192(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length192(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length192(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 192) nounwind
+  ret i32 %m
+}
+
+define i1 @length192_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length192_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length192_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length192_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length192_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 192) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length255(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length255(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length255(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length255(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length255(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 255) nounwind
+  ret i32 %m
+}
+
+define i1 @length255_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length255_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length255_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length255_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length255_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 255) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length256(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length256(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length256(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length256(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length256(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 256) nounwind
+  ret i32 %m
+}
+
+define i1 @length256_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length256_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length256_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length256_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length256_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 256) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length384(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length384(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length384(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length384(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length384(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 384) nounwind
+  ret i32 %m
+}
+
+define i1 @length384_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length384_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length384_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length384_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length384_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 384) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length511(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length511(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length511(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length511(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length511(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 511) nounwind
+  ret i32 %m
+}
+
+define i1 @length511_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length511_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length511_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length511_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length511_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 511) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length512(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length512(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length512(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length512(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length512(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 512) nounwind
+  ret i32 %m
+}
+
+define i1 @length512_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length512_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length512_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length512_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length512_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 512) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks that we do not do stupid things with huge sizes.
+define i32 @huge_length(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @huge_length(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @huge_length(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @huge_length(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @huge_length(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
+  ret i32 %m
+}
+
+define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @huge_length_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @huge_length_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @huge_length_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @huge_length_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks non-constant sizes.
+define i32 @nonconst_length(ptr %X, ptr %Y, i32 %size) nounwind {
+; X86-LABEL: define i32 @nonconst_length(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @nonconst_length(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @nonconst_length(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @nonconst_length(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i32 %size) nounwind {
+; X86-LABEL: define i1 @nonconst_length_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs.ll
new file mode 100644
index 00000000000000..56489a08800b76
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs.ll
@@ -0,0 +1,18833 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way.
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4  -mtriple=x86_64-unknown-unknown               < %s | FileCheck %s --check-prefixes=X64
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=sse4.1 < %s | FileCheck %s --check-prefixes=X64-SSE41
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx    < %s | FileCheck %s --check-prefixes=X64-AVX1
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx2   < %s | FileCheck %s --check-prefixes=X64-AVX2
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512bw,+prefer-256-bit < %s | FileCheck %s --check-prefixes=X64-AVX512BW-256
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512bw,-prefer-256-bit < %s | FileCheck %s --check-prefixes=X64-AVX512BW
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-AVX512F-256
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-AVX512F
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-MIC-AVX2
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-MIC-AVX512F
+
+; This tests codegen time inlining/optimization of memcmp
+; rdar://6480398
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length0(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; X64-NEXT:    ret i32 0
+;
+; X64-SSE41-LABEL: define i32 @length0(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-SSE41-NEXT:    ret i32 0
+;
+; X64-AVX1-LABEL: define i32 @length0(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX1-NEXT:    ret i32 0
+;
+; X64-AVX2-LABEL: define i32 @length0(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX2-NEXT:    ret i32 0
+;
+; X64-AVX512BW-256-LABEL: define i32 @length0(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512BW-256-NEXT:    ret i32 0
+;
+; X64-AVX512BW-LABEL: define i32 @length0(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512BW-NEXT:    ret i32 0
+;
+; X64-AVX512F-256-LABEL: define i32 @length0(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512F-256-NEXT:    ret i32 0
+;
+; X64-AVX512F-LABEL: define i32 @length0(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512F-NEXT:    ret i32 0
+;
+; X64-MIC-AVX2-LABEL: define i32 @length0(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-MIC-AVX2-NEXT:    ret i32 0
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length0(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-MIC-AVX512F-NEXT:    ret i32 0
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  ret i32 %m
+  }
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length0_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
+; X64-SSE41-LABEL: define i1 @length0_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    ret i1 true
+;
+; X64-AVX1-LABEL: define i1 @length0_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    ret i1 true
+;
+; X64-AVX2-LABEL: define i1 @length0_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    ret i1 true
+;
+; X64-AVX512BW-256-LABEL: define i1 @length0_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    ret i1 true
+;
+; X64-AVX512BW-LABEL: define i1 @length0_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    ret i1 true
+;
+; X64-AVX512F-256-LABEL: define i1 @length0_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    ret i1 true
+;
+; X64-AVX512F-LABEL: define i1 @length0_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    ret i1 true
+;
+; X64-MIC-AVX2-LABEL: define i1 @length0_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    ret i1 true
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length0_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    ret i1 true
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length0_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
+; X64-SSE41-LABEL: define i1 @length0_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    ret i1 false
+;
+; X64-AVX1-LABEL: define i1 @length0_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    ret i1 false
+;
+; X64-AVX2-LABEL: define i1 @length0_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    ret i1 false
+;
+; X64-AVX512BW-256-LABEL: define i1 @length0_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    ret i1 false
+;
+; X64-AVX512BW-LABEL: define i1 @length0_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    ret i1 false
+;
+; X64-AVX512F-256-LABEL: define i1 @length0_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    ret i1 false
+;
+; X64-AVX512F-LABEL: define i1 @length0_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    ret i1 false
+;
+; X64-MIC-AVX2-LABEL: define i1 @length0_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    ret i1 false
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length0_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    ret i1 false
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i32 @length2(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i32 @length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i32 @length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length2(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512BW-LABEL: define i32 @length2(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length2(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512F-LABEL: define i32 @length2(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP7]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length2(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length2(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP7]]
+;
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length2_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length3(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length3(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length3(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length3(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length3(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length3(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length3(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length3(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length3(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length3(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length3_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length3_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length3_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length3_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length3_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length3_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length3_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length3_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length3_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length3_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-SSE41-LABEL: define i32 @length4(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length4(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length4(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length4(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-LABEL: define i32 @length4(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length4(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-LABEL: define i32 @length4(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length4(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length4(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-SSE41-LABEL: define i1 @length4_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    ret i1 [[TMP5]]
+;
+; X64-SSE41-LABEL: define i1 @length4_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX1-LABEL: define i1 @length4_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX2-LABEL: define i1 @length4_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    ret i1 [[TMP5]]
+;
+; X64-SSE41-LABEL: define i1 @length4_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX1-LABEL: define i1 @length4_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX2-LABEL: define i1 @length4_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length4_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length4_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length5(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length5(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length5(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length5(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length5(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length5(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length5(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length5(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length5(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length5(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length5_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length5_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length5_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length5_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length5_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length5_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length5_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length5_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length5_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length5_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length5_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length5_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length5_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length5_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length5_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length5_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length5_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length5_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length5_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length5_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length7(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length7(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length7(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length7(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length7(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length7(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length7(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length7(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length7(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length7(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length7(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  ret i32 %m
+}
+
+define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length7_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    ret i1 [[TMP10]]
+;
+; X64-SSE41-LABEL: define i1 @length7_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX1-LABEL: define i1 @length7_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length7_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length7_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length7_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length7_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length7_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length7_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length7_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length7_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length7_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length7_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length7_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length7_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length7_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length7_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length7_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length7_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length7_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length8(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-SSE41-LABEL: define i32 @length8(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length8(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length8(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length8(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-LABEL: define i32 @length8(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length8(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-LABEL: define i32 @length8(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length8(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length8(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length8_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length8_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length8_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length8_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length8_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length8_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length8_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length8_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length8_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-SSE41-LABEL: define i1 @length8_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length8_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length8_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length8_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length8_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length8_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length9_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length9_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length9_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length9_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length9_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length9_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length9_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length9_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length9_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length9_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length10_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length10_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length10_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length10_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length10_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length10_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length10_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length10_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length10_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length10_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length11_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length11_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length11_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length11_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length11_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length11_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length11_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length11_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length11_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length11_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length12_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length12_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length12_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length12_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length12_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length12_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length12_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length12_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length12_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length12_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length12(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length12(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-SSE41-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-SSE41-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length12(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX1-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX1-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length12(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length12(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length12(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512BW-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length12(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length12(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512F-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512F-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length12(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length12(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
+; NOTE(review): equality-only 13-byte memcmp. Every check prefix below expands
+; it to the same libcall-free pattern: two unaligned i64 loads per operand
+; (at offsets 0 and 5, overlapping by 3 bytes), xor/or-reduced to one icmp
+; against zero.
+; X64-LABEL: define i1 @length13_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length13_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length13_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length13_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length13_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length13_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length13_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length13_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length13_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length13_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  ; Input IR under test: a raw memcmp(13) whose result is only compared to 0.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
+; NOTE(review): equality-only 14-byte memcmp. Every check prefix below expands
+; it to the same libcall-free pattern: two unaligned i64 loads per operand
+; (at offsets 0 and 6, overlapping by 2 bytes), xor/or-reduced to one icmp
+; against zero.
+; X64-LABEL: define i1 @length14_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length14_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length14_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length14_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length14_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length14_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length14_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length14_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length14_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length14_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  ; Input IR under test: a raw memcmp(14) whose result is only compared to 0.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
+; NOTE(review): equality-only 15-byte memcmp. Every check prefix below expands
+; it to the same libcall-free pattern: two unaligned i64 loads per operand
+; (at offsets 0 and 7, overlapping by 1 byte), xor/or-reduced to one icmp
+; against zero.
+; X64-LABEL: define i1 @length15_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length15_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length15_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length15_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length15_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length15_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length15_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length15_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length15_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length15_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  ; Input IR under test: a raw memcmp(15) whose result is only compared to 0.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length16(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length16(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length16(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length16(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length16(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length16(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length16(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length16(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length16(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length16(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+; length16_eq: equality-only 16-byte memcmp. When only ==/!= of the memcmp
+; result is consumed, ExpandMemCmp emits one i128 load per operand and a
+; single wide `icmp ne`, skipping the bswap/res_block machinery used by the
+; ordered (three-way) compares elsewhere in this file.
+; NOTE(review): each IR check block below also captures a dead
+; `zext i1 ... to i32` ([[TMP4]]) that the expansion emits but never uses --
+; presumably removed by later DCE; confirm this is the intended pass output.
+; NOTE(review): the X64-AVX / X64-MIC-AVX blocks near the end are *assembly*
+; checks (vmovdqu/vptest/retq) inside an IR test -- they look like leftovers
+; from the original llc test this was ported from; verify whether any RUN
+; line still exercises those prefixes.
+define i1 @length16_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length16_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-SSE41-LABEL: define i1 @length16_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length16_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length16_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length16_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length16_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length16_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE2-LABEL: define i1 @length16_lt(
+; X64-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE2:       res_block:
+; X64-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE2:       loadbb:
+; X64-SSE2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE2:       loadbb1:
+; X64-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE2:       endblock:
+; X64-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE2-NEXT:    ret i1 [[CMP]]
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length16_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length16_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length16_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length16_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE2-LABEL: define i1 @length16_gt(
+; X64-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE2:       res_block:
+; X64-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE2:       loadbb:
+; X64-SSE2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE2:       loadbb1:
+; X64-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE2:       endblock:
+; X64-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE2-NEXT:    ret i1 [[CMP]]
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length16_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length16_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-AVX-LABEL: length16_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length16_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length24(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length24(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length24(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length24(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length24(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length24(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length24(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length24(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length24(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length24(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length24_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length24_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length24_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length24_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length24_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length24_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-NEXT:    ret i1 [[TMP8]]
+;
+; X64-SSE41-LABEL: define i1 @length24_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX-LABEL: length24_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length24_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length31(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length31(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length31(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length31(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length31(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length31(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length31(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length31(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length31(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length31(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length31(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
+  ret i32 %m
+}
+
+define i1 @length31_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length31_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; NOTE(review): dropped stale llc-generated assembly CHECK lines for the
+; X64-AVX and X64-MIC-AVX prefixes that were carried over from the original
+; CodeGen test. This test now runs the IR-level ExpandMemCmp pass through
+; opt, so only the update_test_checks.py-generated IR checks above apply;
+; assembly checks can never match or be regenerated here.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length31_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length31_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+;
+; X64-LABEL: define i1 @length31_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length31_eq_prefer128:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length31_eq_prefer128:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length31_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP7]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP7]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX-LABEL: length31_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length31_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [943142453,842084409,909456435,809056311]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length32(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length32(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length32(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length32(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length32(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length32(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length32(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length32(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length32(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length32(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length32_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512-LABEL: length32_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length32_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length32_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length32_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+;
+; X64-LABEL: define i1 @length32_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length32_eq_prefer128:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length32_eq_prefer128:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind {
+; Expansion of a 32-byte memcmp against the constant @.str, consumed as (result != 0).
+; Scalar/SSE configs: two i128 loads XORed with the constant halves, OR-reduced, icmp ne.
+; 256-bit-capable configs: a single i256 load compared against the full constant.
+; NOTE(review): the IR checks below are auto-generated (update_test_checks.py) — regenerate, do not hand-edit.
+;
+; X64-LABEL: define i1 @length32_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; NOTE(review): the asm checks below look like stale leftovers from the llc version of this
+; test (this file now runs opt-level IR checks) — presumably to be deleted; confirm with author.
+; X64-AVX512-LABEL: length32_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length32_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
<test></test>
+define i32 @length48(ptr %X, ptr %Y) nounwind {
+; A 48-byte memcmp whose full i32 ordering result is used: the checks show it is NOT
+; expanded for any configuration — the libcall is kept verbatim.
+; NOTE(review): the IR checks below are auto-generated (update_test_checks.py) — regenerate, do not hand-edit.
+; X64-LABEL: define i32 @length48(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length48(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length48(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length48(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length48(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length48(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length48(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length48(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length48(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length48(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+; NOTE(review): the blank lines below are residue from deleted asm checks — consider trimming.
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
+  ret i32 %m
+}
+
+define i1 @length48_eq(ptr %x, ptr %y) nounwind {
+; 48-byte memcmp used only for equality: expanded to overlapping-free chunked loads.
+; Scalar/SSE configs: three i128 chunks, pairwise XOR, OR-reduced, compared to zero.
+; 256-bit configs: one i256 chunk plus a trailing i128 chunk zext'ed to i256.
+; NOTE(review): the IR checks below are auto-generated (update_test_checks.py) — regenerate, do not hand-edit.
+;
+; X64-LABEL: define i1 @length48_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; NOTE(review): the asm checks below look like stale leftovers from the llc version of this
+; test (this file now runs opt-level IR checks) — presumably to be deleted; confirm with author.
+; X64-AVX512-LABEL: length48_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-AVX512-NEXT:    vmovdqu 32(%rsi), %xmm2
+; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length48_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_lt(ptr %x, ptr %y) nounwind {
+; 48-byte memcmp whose sign (result < 0) is needed: not expanded for any configuration —
+; the libcall plus the `icmp slt` on its result are kept.
+; NOTE(review): the IR checks below are auto-generated (update_test_checks.py) — regenerate, do not hand-edit.
+; X64-LABEL: define i1 @length48_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; NOTE(review): the blank lines below are residue from deleted asm checks — consider trimming.
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_gt(ptr %x, ptr %y) nounwind {
+; Mirror of @length48_lt with (result > 0): not expanded for any configuration —
+; the libcall plus the `icmp sgt` on its result are kept.
+; NOTE(review): the IR checks below are auto-generated (update_test_checks.py) — regenerate, do not hand-edit.
+; X64-LABEL: define i1 @length48_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; NOTE(review): the blank lines below are residue from deleted asm checks — consider trimming.
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+;
+; X64-LABEL: define i1 @length48_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length48_eq_prefer128:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
+; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpxor 32(%rsi), %xmm2, %xmm1
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length48_eq_prefer128:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm4
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm5
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm4, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    korw %k0, %k1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm5, %zmm2, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k1, %k0
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length48_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X64-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X64-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-NEXT:    ret i1 [[TMP11]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP11]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512-LABEL: length48_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length48_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length63(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length63(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length63(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length63(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length63(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length63(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length63(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length63(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length63(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length63(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length63(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
+  ret i32 %m
+}
+
+define i1 @length63_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length63_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X64-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-NEXT:    ret i1 [[TMP22]]
+;
+; X64-SSE41-LABEL: define i1 @length63_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX1-LABEL: define i1 @length63_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length63_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512-LABEL: length63_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-AVX512-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length63_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm2
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rsi), %ymm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length63_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length63_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length63_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length63_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; Ordered (slt) memcmp of 63 bytes is not expanded inline on any of these
+; subtargets: the checks above expect the libcall to survive.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length63_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length63_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length63_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length63_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; Ordered (sgt) memcmp of 63 bytes is not expanded inline on any of these
+; subtargets: the checks above expect the libcall to survive.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length63_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X64-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length63_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length63_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length63_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+; NOTE(review): the asm checks below (prefixes X64-AVX512 and X64-MIC-AVX)
+; look like leftovers from the llc test this was ported from -- if no RUN
+; line in this file requests those prefixes, they are dead and should be
+; dropped. TODO confirm against the RUN header.
+; X64-AVX512-LABEL: length63_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length63_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [875770417,943142453,842084409,909456435,809056311,875770417,943142453,842084409]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length64(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length64(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length64(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length64(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length64(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length64(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length64(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length64(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length64(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length64(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+; Ordered memcmp of 64 bytes whose full i32 result is used is not expanded
+; inline on any of these subtargets: the libcall is expected to remain.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length64_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X64-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-NEXT:    ret i1 [[TMP22]]
+;
+; X64-SSE41-LABEL: define i1 @length64_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512-LABEL: length64_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-AVX512-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
+; X64-AVX512-NEXT:    kortestw %k0, %k0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length64_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length64_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length64_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length64_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length64_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length64_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length64_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X64-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length64_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512-LABEL: length64_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-AVX512-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
+; X64-AVX512-NEXT:    kortestw %k0, %k0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length96(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length96(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length96(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length96(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length96(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length96(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length96(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length96(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length96(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length96(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length96(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
+  ret i32 %m
+}
+
+define i1 @length96_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length96_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $96, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length96_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX2-LABEL: define i1 @length96_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP16]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length96_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length96_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length96_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length96_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length96_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $96, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length96_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 96) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length96_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length96_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length96_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length127(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length127(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length127(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length127(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length127(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length127(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length127(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length127(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length127(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length127(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length127(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
+  ret i32 %m
+}
+
+define i1 @length127_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length127_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $127, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length127_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX1-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX2-LABEL: define i1 @length127_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512F-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length127_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length127_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length127_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length127_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length127_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $127, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length127_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 127) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length127_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length127_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length127_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length128(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
+  ret i32 %m
+}
+
+define i1 @length128_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length128_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $128, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length128_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX1-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX2-LABEL: define i1 @length128_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512F-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length128_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length128_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  ; Every check prefix above expects the memcmp(128) libcall to remain and
+  ; the slt compare to stay on its result -- presumably the pass only inlines
+  ; equality-style comparisons at this size; TODO(review) confirm against
+  ; ExpandMemCmp's handling of ordered (non-zero-equality) results.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length128_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length128_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  ; Mirror of @length128_lt with sgt: every prefix expects the libcall to be
+  ; kept for the ordered comparison of 128 bytes.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_eq_const(ptr %X) nounwind {
+; NOTE(review): the X64-SSE block below checks x86 assembly (pushq/callq),
+; while every other block in this function checks IR -- it looks like a
+; leftover from the original llc test this was ported from. Confirm the RUN
+; lines still define an X64-SSE prefix, or drop this block.
+; X64-SSE-LABEL: length128_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $128, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length128_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 128) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length128_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length128_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length128_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  ; memcmp(X, @.str, 128) == 0: AVX-capable prefixes expand to wide
+  ; loads xor'ed against constants folded from @.str; base X64 and SSE41
+  ; keep the libcall.
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length192(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length192(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length192(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length192(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length192(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length192(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length192(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length192(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length192(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length192(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length192(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  ; Full i32 memcmp result for 192 bytes: every prefix expects the libcall
+  ; to be kept (no inline expansion when the ordered result is used).
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
+  ret i32 %m
+}
+
+define i1 @length192_eq(ptr %x, ptr %y) nounwind {
+; NOTE(review): the X64-SSE block below checks x86 assembly, unlike the IR
+; checks used everywhere else in this function -- it looks like a leftover
+; from the llc test this was ported from. Confirm the RUN lines still define
+; an X64-SSE prefix, or drop this block.
+; X64-SSE-LABEL: length192_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $192, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length192_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP16]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP16]]
+;
+  ; memcmp(x, y, 192) != 0: the 512-bit-capable prefixes (AVX512BW, AVX512F,
+  ; MIC-AVX512F) expand to three i512 load/xor chains; all other prefixes
+  ; expect the libcall to remain.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length192_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length192_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length192_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $192, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length192_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length192_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length192_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length192_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length255(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length255(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length255(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length255(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length255(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length255(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length255(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length255(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length255(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length255(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length255(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
+  ret i32 %m
+}
+
+define i1 @length255_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length255_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $255, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length255_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length255_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length255_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_eq_const(ptr %X) nounwind {
+; NOTE(review): dropped the stale llc-era asm CHECK block (X64-SSE-LABEL with
+; pushq/callq memcmp/retq) left over from the original CodeGen test; this is
+; now an opt IR test, so assembly checks can never match. Please confirm no
+; RUN line still uses an X64-SSE asm prefix before landing.
+; NOTE(review): remaining checks are auto-generated (update_test_checks.py);
+; regenerate rather than hand-editing. The last i512 load at offset 191
+; overlaps the previous one on purpose (255 = 191 + 64).
+; X64-LABEL: define i1 @length255_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length255_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length255_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length255_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length256(ptr %X, ptr %Y) nounwind {
+; NOTE(review): every prefix below still emits the memcmp libcall at length
+; 256, so this function pins the "no expansion" behavior. Checks are
+; auto-generated (update_test_checks.py) -- regenerate, don't hand-edit.
+; NOTE(review): the blank lines after the last check block look like filler
+; left by check regeneration; harmless to FileCheck, but could be cleaned up
+; on the next regen.
+; X64-LABEL: define i32 @length256(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length256(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length256(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length256(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length256(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length256(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length256(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length256(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length256(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length256(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
+  ret i32 %m
+}
+
+define i1 @length256_eq(ptr %x, ptr %y) nounwind {
+; NOTE(review): dropped the stale llc-era asm CHECK block (X64-SSE-LABEL with
+; pushq/callq memcmp/setne/retq) left over from the original CodeGen test;
+; this is now an opt IR test, so assembly checks can never match. Please
+; confirm no RUN line still uses an X64-SSE asm prefix before landing.
+; NOTE(review): the AVX512 expansions below end with an unused zext
+; ([[TMP23]]) before 'ret i1 [[TMP22]]' -- a dead instruction emitted by the
+; expansion, presumably cleaned up by later passes. Checks are auto-generated;
+; if the pass stops emitting it, regenerate rather than hand-editing.
+; X64-LABEL: define i1 @length256_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_lt(ptr %x, ptr %y) nounwind {
+; NOTE(review): ordered (slt) comparison is never expanded here -- every
+; prefix keeps the memcmp libcall. Checks are auto-generated
+; (update_test_checks.py); regenerate, don't hand-edit. The blank lines after
+; the last check block look like regeneration leftovers; harmless.
+; X64-LABEL: define i1 @length256_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_gt(ptr %x, ptr %y) nounwind {
+; NOTE(review): ordered (sgt) comparison is never expanded here -- every
+; prefix keeps the memcmp libcall. Checks are auto-generated
+; (update_test_checks.py); regenerate, don't hand-edit. The blank lines after
+; the last check block look like regeneration leftovers; harmless.
+; X64-LABEL: define i1 @length256_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_eq_const(ptr %X) nounwind {
+; NOTE(review): dropped the stale llc-era asm CHECK block (X64-SSE-LABEL with
+; pushq/callq memcmp/sete/retq) left over from the original CodeGen test; this
+; is now an opt IR test, so assembly checks can never match. Please confirm no
+; RUN line still uses an X64-SSE asm prefix before landing.
+; NOTE(review): remaining checks are auto-generated (update_test_checks.py);
+; regenerate rather than hand-editing.
+; X64-LABEL: define i1 @length256_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length256_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length256_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length256_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length384(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length384(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length384(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length384(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length384(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length384(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length384(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length384(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length384(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length384(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length384(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
+  ret i32 %m
+}
+
+define i1 @length384_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length384_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length384_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length384_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length384_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length511(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length511(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length511(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length511(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length511(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length511(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length511(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length511(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length511(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length511(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length511(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
+  ret i32 %m
+}
+
+define i1 @length511_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length511_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length511_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length511_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length511_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length512(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length512(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length512(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length512(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length512(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length512(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length512(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length512(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length512(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length512(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length512(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
+  ret i32 %m
+}
+
+define i1 @length512_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length512_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length512_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length512_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length512_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks that we do not do stupid things with huge sizes.
+define i32 @huge_length(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @huge_length(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @huge_length(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @huge_length(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @huge_length(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @huge_length(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @huge_length(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @huge_length(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @huge_length(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @huge_length(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @huge_length(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  ret i32 %m
+}
+
+define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @huge_length_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @huge_length_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @huge_length_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @huge_length_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @huge_length_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @huge_length_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @huge_length_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @huge_length_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @huge_length_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @huge_length_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks non-constant sizes.
+define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
+; X64-LABEL: define i32 @nonconst_length(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @nonconst_length(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @nonconst_length(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @nonconst_length(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @nonconst_length(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @nonconst_length(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @nonconst_length(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @nonconst_length(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @nonconst_length(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @nonconst_length(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
+; X64-LABEL: define i1 @nonconst_length_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @nonconst_length_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @nonconst_length_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @nonconst_length_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-nobuiltin.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-nobuiltin.ll
new file mode 100644
index 00000000000000..1ad91adb9e533e
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-nobuiltin.ll
@@ -0,0 +1,248 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown         < %s | FileCheck %s --check-prefix=X64_1LD
+; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown         < %s | FileCheck %s --check-prefix=X64_2LD
+
+
+declare signext i32 @memcmp(ptr %src1, ptr %src2, i64 %size)
+
+; Zero-length comparisons should be optimized away.
+define i32 @f1(ptr %src1, ptr %src2) {
+; X64-LABEL: define i32 @f1(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 0) #[[ATTR0:[0-9]+]]
+; X64-NEXT:    ret i32 [[RES]]
+;
+; X64_1LD-LABEL: define i32 @f1(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 0) #[[ATTR0:[0-9]+]]
+; X64_1LD-NEXT:    ret i32 [[RES]]
+;
+; X64_2LD-LABEL: define i32 @f1(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 0) #[[ATTR0:[0-9]+]]
+; X64_2LD-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 0) nobuiltin
+  ret i32 %res
+}
+
+; Check a case where the result is used as an integer.
+define i32 @f2(ptr %src1, ptr %src2) {
+; X64-LABEL: define i32 @f2(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 2) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[RES]]
+;
+; X64_1LD-LABEL: define i32 @f2(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 2) #[[ATTR0]]
+; X64_1LD-NEXT:    ret i32 [[RES]]
+;
+; X64_2LD-LABEL: define i32 @f2(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 2) #[[ATTR0]]
+; X64_2LD-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 2) nobuiltin
+  ret i32 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f3(ptr %src1, ptr %src2, ptr %dest) {
+; X64-LABEL: define void @f3(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 3) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[RES]], 0
+; X64-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64:       store:
+; X64-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64-NEXT:    br label [[EXIT]]
+; X64:       exit:
+; X64-NEXT:    ret void
+;
+; X64_1LD-LABEL: define void @f3(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 3) #[[ATTR0]]
+; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[RES]], 0
+; X64_1LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_1LD:       store:
+; X64_1LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_1LD-NEXT:    br label [[EXIT]]
+; X64_1LD:       exit:
+; X64_1LD-NEXT:    ret void
+;
+; X64_2LD-LABEL: define void @f3(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 3) #[[ATTR0]]
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[RES]], 0
+; X64_2LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_2LD:       store:
+; X64_2LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_2LD-NEXT:    br label [[EXIT]]
+; X64_2LD:       exit:
+; X64_2LD-NEXT:    ret void
+;
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 3) nobuiltin
+  %cmp = icmp eq i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, ptr %dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for inequality.
+define void @f4(ptr %src1, ptr %src2, ptr %dest) {
+; X64-LABEL: define void @f4(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64-NEXT:  entry:
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 4) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[RES]], 0
+; X64-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64:       store:
+; X64-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64-NEXT:    br label [[EXIT]]
+; X64:       exit:
+; X64-NEXT:    ret void
+;
+; X64_1LD-LABEL: define void @f4(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_1LD-NEXT:  entry:
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 4) #[[ATTR0]]
+; X64_1LD-NEXT:    [[CMP:%.*]] = icmp ne i32 [[RES]], 0
+; X64_1LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_1LD:       store:
+; X64_1LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_1LD-NEXT:    br label [[EXIT]]
+; X64_1LD:       exit:
+; X64_1LD-NEXT:    ret void
+;
+; X64_2LD-LABEL: define void @f4(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_2LD-NEXT:  entry:
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 4) #[[ATTR0]]
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp ne i32 [[RES]], 0
+; X64_2LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_2LD:       store:
+; X64_2LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_2LD-NEXT:    br label [[EXIT]]
+; X64_2LD:       exit:
+; X64_2LD-NEXT:    ret void
+;
+entry:
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 4) nobuiltin
+  %cmp = icmp ne i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, ptr %dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested via slt.
+define void @f5(ptr %src1, ptr %src2, ptr %dest) {
+; X64-LABEL: define void @f5(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64-NEXT:  entry:
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 5) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[RES]], 0
+; X64-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64:       store:
+; X64-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64-NEXT:    br label [[EXIT]]
+; X64:       exit:
+; X64-NEXT:    ret void
+;
+; X64_1LD-LABEL: define void @f5(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_1LD-NEXT:  entry:
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 5) #[[ATTR0]]
+; X64_1LD-NEXT:    [[CMP:%.*]] = icmp slt i32 [[RES]], 0
+; X64_1LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_1LD:       store:
+; X64_1LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_1LD-NEXT:    br label [[EXIT]]
+; X64_1LD:       exit:
+; X64_1LD-NEXT:    ret void
+;
+; X64_2LD-LABEL: define void @f5(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_2LD-NEXT:  entry:
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 5) #[[ATTR0]]
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp slt i32 [[RES]], 0
+; X64_2LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_2LD:       store:
+; X64_2LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_2LD-NEXT:    br label [[EXIT]]
+; X64_2LD:       exit:
+; X64_2LD-NEXT:    ret void
+;
+entry:
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 5) nobuiltin
+  %cmp = icmp slt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, ptr %dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for sgt.
+define void @f6(ptr %src1, ptr %src2, ptr %dest) {
+; X64-LABEL: define void @f6(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64-NEXT:  entry:
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 6) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[RES]], 0
+; X64-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64:       store:
+; X64-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64-NEXT:    br label [[EXIT]]
+; X64:       exit:
+; X64-NEXT:    ret void
+;
+; X64_1LD-LABEL: define void @f6(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_1LD-NEXT:  entry:
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 6) #[[ATTR0]]
+; X64_1LD-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[RES]], 0
+; X64_1LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_1LD:       store:
+; X64_1LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_1LD-NEXT:    br label [[EXIT]]
+; X64_1LD:       exit:
+; X64_1LD-NEXT:    ret void
+;
+; X64_2LD-LABEL: define void @f6(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_2LD-NEXT:  entry:
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 6) #[[ATTR0]]
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[RES]], 0
+; X64_2LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_2LD:       store:
+; X64_2LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_2LD-NEXT:    br label [[EXIT]]
+; X64_2LD:       exit:
+; X64_2LD-NEXT:    ret void
+;
+entry:
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 6) nobuiltin
+  %cmp = icmp sgt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, ptr %dest
+  br label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize-x32.ll
new file mode 100644
index 00000000000000..b36c0db432820d
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize-x32.ll
@@ -0,0 +1,870 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=cmov < %s | FileCheck %s --check-prefix=X86
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=X86-SSE2
+
+; This tests codegen time inlining/optimization of memcmp
+; rdar://6480398
+
+@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i32)
+declare dso_local i32 @bcmp(ptr, ptr, i32)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i32 @length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length2_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind optsize {
+; X86-LABEL: define i1 @length2_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length3(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length3(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length3_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length3_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length4(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE2-LABEL: define i32 @length4(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length4_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind optsize {
+; X86-LABEL: define i1 @length4_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length5(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length5(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length5_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length5_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length8(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length8(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length8_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind optsize {
+; X86-LABEL: define i1 @length8_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length12_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length12_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length12(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length12(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length16(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length16(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind optsize {
+; X86-NOSSE-LABEL: length16_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind optsize {
+; X86-NOSSE-LABEL: length16_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length24(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length24(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize {
+; X86-NOSSE-LABEL: length24_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind optsize {
+; X86-NOSSE-LABEL: length24_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length32(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length32(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize {
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind optsize {
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length64(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length64(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize {
+; X86-LABEL: define i1 @length64_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind optsize {
+; X86-LABEL: define i1 @length64_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @bcmp_length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i32 [[TMP4]]
+;
+; X86-SSE2-LABEL: define i32 @bcmp_length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize.ll
new file mode 100644
index 00000000000000..cb6c5e6da1c790
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize.ll
@@ -0,0 +1,1414 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=X64
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s --check-prefix=X64-AVX1
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s --check-prefix=X64-AVX2
+
+; This tests codegen time inlining/optimization of memcmp
+; rdar://6480398
+
+ at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+declare dso_local i32 @bcmp(ptr, ptr, i64)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i32 @length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i32 @length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length2_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind optsize {
+; X64-LABEL: define i1 @length2_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2:[0-9]+]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length3(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length3(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length3(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length3_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length3_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length3_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length4(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length4(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length4_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind optsize {
+; X64-LABEL: define i1 @length4_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length5(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length5(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length5(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length5_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length5_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length5_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length8(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length8(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length8(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length8_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind optsize {
+; X64-LABEL: define i1 @length8_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length12_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length12_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length12_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length12(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length12(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX1-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX1-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length12(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length16(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length16(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length16(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind optsize {
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length16_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind optsize {
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length16_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length16_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length24(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR3:[0-9]+]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length24(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR4:[0-9]+]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length24(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR4:[0-9]+]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize {
+; X64-SSE2-LABEL: length24_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length24_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length24_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind optsize {
+; X64-SSE2-LABEL: length24_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pand %xmm1, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length24_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length24_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length32(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR3]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length32(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR4]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length32(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR4]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize {
+; X64-SSE2-LABEL: length32_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
+; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length32_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind optsize {
+; X64-SSE2-LABEL: length32_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pand %xmm1, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length32_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length64(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR3]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length64(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR4]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length64(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR4]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize {
+; X64-SSE2-LABEL: length64_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movl $64, %edx
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length64_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR3]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind optsize {
+; X64-SSE2-LABEL: length64_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movl $.L.str, %esi
+; X64-SSE2-NEXT:    movl $64, %edx
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length64_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 64) #[[ATTR3]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @bcmp_length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX1-LABEL: define i32 @bcmp_length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX2-LABEL: define i32 @bcmp_length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso-x32.ll
new file mode 100644
index 00000000000000..a8b054cd20e270
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso-x32.ll
@@ -0,0 +1,887 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=cmov < %s | FileCheck %s --check-prefix=X86
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s  --check-prefix=X86-SSE2
+
+; This tests middle-end (ExpandMemCmp) inlining/optimization of memcmp
+; rdar://6480398
+
+ at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i32)
+declare dso_local i32 @bcmp(ptr, ptr, i32)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF14:![0-9]+]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i32 @length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF14:![0-9]+]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length2_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind !prof !14 {
+; X86-LABEL: define i1 @length2_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length3(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length3(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length3_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length3_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length4(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE2-LABEL: define i32 @length4(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length4_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind !prof !14 {
+; X86-LABEL: define i1 @length4_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length5(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length5(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length5_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length5_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length8(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length8(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length8_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind !prof !14 {
+; X86-LABEL: define i1 @length8_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length12_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length12_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length12(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length12(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length16(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length16(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length16_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length16_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length24(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length24(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length24_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length24_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length32(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length32(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length64(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length64(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length64_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind !prof !14 {
+; X86-LABEL: define i1 @length64_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @bcmp_length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i32 [[TMP4]]
+;
+; X86-SSE2-LABEL: define i32 @bcmp_length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i32 10000}
+!4 = !{!"MaxCount", i32 10}
+!5 = !{!"MaxInternalCount", i32 1}
+!6 = !{!"MaxFunctionCount", i32 1000}
+!7 = !{!"NumCounts", i32 3}
+!8 = !{!"NumFunctions", i32 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i32 100, i32 1}
+!12 = !{i32 999000, i32 100, i32 1}
+!13 = !{i32 999999, i32 1, i32 2}
+!14 = !{!"function_entry_count", i32 0}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso.ll
new file mode 100644
index 00000000000000..1507cbdc4e86ec
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso.ll
@@ -0,0 +1,1347 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=X64
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s --check-prefix=X64-AVX1
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s --check-prefix=X64-AVX2
+
+; This tests codegen time inlining/optimization of memcmp
+; rdar://6480398
+
+ at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+declare dso_local i32 @bcmp(ptr, ptr, i64)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF14:![0-9]+]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i32 @length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF14:![0-9]+]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i32 @length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF14:![0-9]+]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length2_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind !prof !14 {
+; X64-LABEL: define i1 @length2_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2:[0-9]+]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length3(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length3(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length3(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length3_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length3_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length3_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length4(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length4(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length4_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind !prof !14 {
+; X64-LABEL: define i1 @length4_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length5(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length5(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length5(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length5_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length5_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length5_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length8(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length8(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length8(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length8_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Equality-only memcmp(X, @.str, 8): expands to one i64 load compared against
+; an immediate (presumably the little-endian bytes of @.str — constant lives
+; outside this hunk). No bswap is needed for a pure eq/ne comparison.
+define i1 @length8_eq_const(ptr %X) nounwind !prof !14 {
+; X64-LABEL: define i1 @length8_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Equality-only memcmp of 12 bytes: one i64 load plus one overlapping-free i32
+; tail load, differences combined with xor/or so a single icmp decides the result.
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length12_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length12_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length12_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Full (three-way) 12-byte memcmp: expands to the loadbb/res_block CFG.
+; Loads are bswapped so an unsigned compare yields memcmp's -1/0/1 ordering.
+define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length12(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length12(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX1-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX1-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length12(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+; Full 16-byte memcmp: two i64 chunks through the loadbb/res_block CFG,
+; bswapped so unsigned comparison matches memcmp's byte-wise ordering.
+define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length16(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length16(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length16(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+; Equality-only 16-byte memcmp: a single i128 load pair plus icmp suffices.
+; NOTE(review): dropped the stale "X64-AVX" x86 assembly CHECK lines carried
+; over from the llc test — this is now an IR-level opt test and no RUN line
+; produces assembly, so those checks could never match (verify no X64-AVX
+; prefix remains in the RUN lines).
+define i1 @length16_eq(ptr %x, ptr %y) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length16_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; Equality-only memcmp(X, @.str, 16): one i128 load against an immediate.
+; NOTE(review): dropped the stale "X64-AVX" x86 assembly CHECK lines left over
+; from the llc version of this test — no RUN line in the IR-level test emits
+; assembly, so they were dead expectations.
+define i1 @length16_eq_const(ptr %X) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length16_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+; Full 24-byte memcmp is above the expansion threshold for all prefixes:
+; the libcall must be left untouched.
+define i32 @length24(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length24(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length24(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR4:[0-9]+]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length24(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR4:[0-9]+]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+; Equality-only 24-byte memcmp: i128 chunk plus zext'd i64 tail, differences
+; folded with xor/or into a single compare.
+; NOTE(review): dropped the stale "X64-AVX" x86 assembly CHECK lines carried
+; over from the llc test; the IR-level RUN lines never produce assembly.
+define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length24_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; Equality-only memcmp(X, @.str, 24) against constants: i128 head plus i64
+; tail, each xor'ed with an immediate and folded through or.
+; NOTE(review): dropped the stale "X64-AVX" x86 assembly CHECK lines carried
+; over from the llc test; the IR-level RUN lines never produce assembly.
+define i1 @length24_eq_const(ptr %X) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length24_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Full 32-byte memcmp stays a libcall for all prefixes (three-way result,
+; above the per-target expansion limit).
+define i32 @length32(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length32(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length32(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR4]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length32(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR4]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+; Equality-only 32-byte memcmp: plain X64 splits into two i128 chunks, while
+; AVX-enabled prefixes use a single i256 load pair (wider legal vector type).
+define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length32_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; Equality-only memcmp(X, @.str, 32): plain X64 uses two i128 xor/or chunks;
+; AVX prefixes compare one i256 load against a single wide immediate.
+define i1 @length32_eq_const(ptr %X) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length32_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Full 64-byte memcmp stays a libcall everywhere (three-way result, above
+; every prefix's expansion limit).
+define i32 @length64(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length64(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length64(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR4]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length64(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR4]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+; Equality-only 64-byte memcmp: too large for plain X64 (libcall kept), but
+; AVX prefixes expand to two i256 chunks combined with xor/or.
+define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length64_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length64_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 64) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @bcmp_length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX1-LABEL: define i32 @bcmp_length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX2-LABEL: define i32 @bcmp_length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32-2.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32-2.ll
new file mode 100644
index 00000000000000..8c86c110c7bb2b
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32-2.ll
@@ -0,0 +1,4813 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=cmov < %s | FileCheck %s --check-prefix=X86
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse < %s | FileCheck %s --check-prefix=X86-SSE1
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=X86-SSE2
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=X86-SSE41
+
+; This tests IR-level (middle-end) inlining/optimization of memcmp via the expand-memcmp pass
+; rdar://6480398
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i32)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length0(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-NEXT:    ret i32 0
+;
+; X86-SSE1-LABEL: define i32 @length0(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE1-NEXT:    ret i32 0
+;
+; X86-SSE2-LABEL: define i32 @length0(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE2-NEXT:    ret i32 0
+;
+; X86-SSE41-LABEL: define i32 @length0(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE41-NEXT:    ret i32 0
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  ret i32 %m
+  }
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length0_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    ret i1 true
+;
+; X86-SSE1-LABEL: define i1 @length0_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    ret i1 true
+;
+; X86-SSE2-LABEL: define i1 @length0_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    ret i1 true
+;
+; X86-SSE41-LABEL: define i1 @length0_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    ret i1 true
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length0_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    ret i1 false
+;
+; X86-SSE1-LABEL: define i1 @length0_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    ret i1 false
+;
+; X86-SSE2-LABEL: define i1 @length0_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    ret i1 false
+;
+; X86-SSE41-LABEL: define i1 @length0_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    ret i1 false
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE1-LABEL: define i32 @length2(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i32 @length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i32 @length2(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+define i32 @length2_const(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length2_const(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-NEXT:    ret i32 [[TMP4]]
+;
+; X86-SSE1-LABEL: define i32 @length2_const(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE1-NEXT:    ret i32 [[TMP4]]
+;
+; X86-SSE2-LABEL: define i32 @length2_const(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE2-NEXT:    ret i32 [[TMP4]]
+;
+; X86-SSE41-LABEL: define i32 @length2_const(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE41-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_gt_const(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_gt_const(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_gt_const(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_gt_const(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length2_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length3(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length3(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length3(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length3(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length3_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE1-LABEL: define i1 @length3_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length3_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE41-LABEL: define i1 @length3_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length4(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE1-LABEL: define i32 @length4(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE1-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE2-LABEL: define i32 @length4(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE2-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE41-LABEL: define i32 @length4(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length4_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE1-LABEL: define i1 @length4_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE41-LABEL: define i1 @length4_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length4_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE1-LABEL: define i1 @length4_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE2-LABEL: define i1 @length4_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE41-LABEL: define i1 @length4_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length4_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE1-LABEL: define i1 @length4_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE2-LABEL: define i1 @length4_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE41-LABEL: define i1 @length4_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length4_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length4_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length4_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length5(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length5(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length5(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length5(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length5_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE1-LABEL: define i1 @length5_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length5_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE41-LABEL: define i1 @length5_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length5_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length5_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length5_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length5_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length7(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length7(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length7(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length7(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length7(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  ret i32 %m
+}
+
+define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length7_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length7_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length7_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length7_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length7_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE1-LABEL: define i1 @length7_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE2-LABEL: define i1 @length7_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE41-LABEL: define i1 @length7_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP10]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length8(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length8(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length8(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length8(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length8_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length8_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length8_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length8_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE1-LABEL: define i1 @length8_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length8_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length9_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9) #[[ATTR5:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length9_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9) #[[ATTR5:[0-9]+]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length9_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9) #[[ATTR5:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length9_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9) #[[ATTR5:[0-9]+]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length10_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length10_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length10_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length10_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 10) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length11_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length11_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length11_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length11_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 11) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length12_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length12_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length12_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length12_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length12(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length12(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length12(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length12(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  ret i32 %m
+}
+
+define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length13_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length13_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length13_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length13_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 13) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length14_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length14_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length14_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length14_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 14) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length15(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length15(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length15(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length15(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length15(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
+  ret i32 %m
+}
+
+define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length15_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length15_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length15_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length15_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length15_const(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length15_const(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length15_const(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length15_const(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length15_const(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) nounwind
+  ret i32 %m
+}
+
+define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length15_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length15_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length15_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length15_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length15_gt_const(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp sgt i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length15_gt_const(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp sgt i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length15_gt_const(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp sgt i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length15_gt_const(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length16(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length16(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length16(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length16(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length16_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length16_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE41-LABEL: define i1 @length16_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length16_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length16_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length16_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length16_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length16_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length16_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length16_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length16_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length16_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length24(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length24(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length24(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length24(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length24_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length24_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length24_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length24_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length24_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length24_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length31(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length31(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length31(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length31(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length31(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 31) nounwind
+  ret i32 %m
+}
+
+define i1 @length31_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length31_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length31_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length31_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length31_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-NOSSE-LABEL: length31_eq_prefer128:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length31_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length31_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length31_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 31) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 31) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length32(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length32(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length32(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length32(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length32_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length32_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-NOSSE-LABEL: length32_eq_prefer128:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length48(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length48(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length48(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length48(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length48(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 48) nounwind
+  ret i32 %m
+}
+
+define i1 @length48_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length48_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length48_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length48_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-LABEL: define i1 @length48_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length48_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 48) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length63(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length63(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length63(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length63(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length63(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 63) nounwind
+  ret i32 %m
+}
+
+define i1 @length63_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length63_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length63_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length63_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length63_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length63_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length63_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length63_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length63_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length63_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length63_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 63) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length64(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length64(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length64(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length64(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length64_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length64_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length64_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length64_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length64_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length64_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length64_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length64_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length64_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length96(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length96(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length96(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length96(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length96(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 96) nounwind
+  ret i32 %m
+}
+
+define i1 @length96_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length96_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length96_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length96_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length96_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 96) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length127(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length127(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length127(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length127(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length127(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 127) nounwind
+  ret i32 %m
+}
+
+define i1 @length127_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length127_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length127_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length127_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length127_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 127) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length128(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 128) nounwind
+  ret i32 %m
+}
+
+define i1 @length128_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length128_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length128_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length128_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length128_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 128) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length192(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length192(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length192(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length192(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length192(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 192) nounwind
+  ret i32 %m
+}
+
+define i1 @length192_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length192_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length192_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length192_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length192_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 192) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length255(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length255(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length255(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length255(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length255(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 255) nounwind
+  ret i32 %m
+}
+
+define i1 @length255_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length255_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length255_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length255_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length255_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 255) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length256(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length256(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length256(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length256(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length256(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 256) nounwind
+  ret i32 %m
+}
+
+define i1 @length256_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length256_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length256_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length256_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length256_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 256) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length384(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length384(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length384(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length384(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length384(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 384) nounwind
+  ret i32 %m
+}
+
+define i1 @length384_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length384_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length384_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length384_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length384_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 384) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length511(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length511(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length511(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length511(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length511(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 511) nounwind
+  ret i32 %m
+}
+
+define i1 @length511_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length511_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length511_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length511_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length511_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 511) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length512(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length512(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length512(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length512(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length512(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 512) nounwind
+  ret i32 %m
+}
+
+define i1 @length512_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length512_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length512_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length512_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length512_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 512) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks that we do not do stupid things with huge sizes.
+define i32 @huge_length(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @huge_length(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @huge_length(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @huge_length(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @huge_length(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
+  ret i32 %m
+}
+
+define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @huge_length_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @huge_length_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @huge_length_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @huge_length_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks non-constant sizes.
+define i32 @nonconst_length(ptr %X, ptr %Y, i32 %size) nounwind {
+; X86-LABEL: define i32 @nonconst_length(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @nonconst_length(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @nonconst_length(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @nonconst_length(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i32 %size) nounwind {
+; X86-LABEL: define i1 @nonconst_length_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
index d71ae8be19b668..5a0f4db363536d 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
@@ -1,64 +1,66 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -expand-memcmp -mtriple=i686-unknown-unknown   -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=X32
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown   -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=X32
 
 declare i32 @memcmp(ptr nocapture, ptr nocapture, i32)
 
 define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp2(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X32-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    ret i32 [[TMP9]]
+; X32-LABEL: define i32 @cmp2(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X32-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X32-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X32-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X32-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 2)
   ret i32 %call
 }
 
 define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonly align 2 %y)  {
-; X32-LABEL: @cmp2_align2(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 2
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; X32-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X32-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    ret i32 [[TMP9]]
+; X32-LABEL: define i32 @cmp2_align2(
+; X32-SAME: ptr nocapture readonly align 2 [[X:%.*]], ptr nocapture readonly align 2 [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 2
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 2
+; X32-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X32-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X32-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X32-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X32-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 2)
   ret i32 %call
 }
 
 define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp3(
+; X32-LABEL: define i32 @cmp3(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
+; X32-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X32-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X32-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X32-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X32-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X32-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X32-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X32-NEXT:    br label [[ENDBLOCK]]
 ; X32:       endblock:
-; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 3)
@@ -66,47 +68,49 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp4(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X32-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X32-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; X32-NEXT:    ret i32 [[TMP11]]
+; X32-LABEL: define i32 @cmp4(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X32-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X32-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X32-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X32-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X32-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X32-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 4)
   ret i32 %call
 }
 
 define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp5(
+; X32-LABEL: define i32 @cmp5(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP7:%.*]], [[TMP8:%.*]]
+; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X32-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X32-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X32-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X32-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X32-NEXT:    br label [[ENDBLOCK]]
 ; X32:       endblock:
-; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 5)
@@ -114,32 +118,33 @@ define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp6(
+; X32-LABEL: define i32 @cmp6(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X32-NEXT:    [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
-; X32-NEXT:    [[TMP18]] = zext i16 [[TMP16]] to i32
-; X32-NEXT:    [[TMP19]] = zext i16 [[TMP17]] to i32
-; X32-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]]
-; X32-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
+; X32-NEXT:    [[TMP13:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP11]])
+; X32-NEXT:    [[TMP14]] = zext i16 [[TMP12]] to i32
+; X32-NEXT:    [[TMP15]] = zext i16 [[TMP13]] to i32
+; X32-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], [[TMP15]]
+; X32-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X32:       endblock:
 ; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
@@ -149,30 +154,31 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp7(
+; X32-LABEL: define i32 @cmp7(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X32-NEXT:    [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X32-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
-; X32-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X32-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X32-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X32-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X32:       endblock:
 ; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
@@ -182,30 +188,31 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp8(
+; X32-LABEL: define i32 @cmp8(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X32-NEXT:    [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X32-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
-; X32-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X32-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X32-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X32-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X32:       endblock:
 ; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
@@ -215,8 +222,9 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp9(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 9)
+; X32-LABEL: define i32 @cmp9(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 9)
@@ -224,8 +232,9 @@ define i32 @cmp9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp10(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 10)
+; X32-LABEL: define i32 @cmp10(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 10)
@@ -233,8 +242,9 @@ define i32 @cmp10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp11(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 11)
+; X32-LABEL: define i32 @cmp11(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 11)
@@ -242,8 +252,9 @@ define i32 @cmp11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp12(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 12)
+; X32-LABEL: define i32 @cmp12(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 12)
@@ -251,8 +262,9 @@ define i32 @cmp12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp13(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 13)
+; X32-LABEL: define i32 @cmp13(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 13)
@@ -260,8 +272,9 @@ define i32 @cmp13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp14(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 14)
+; X32-LABEL: define i32 @cmp14(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 14)
@@ -269,8 +282,9 @@ define i32 @cmp14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp15(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 15)
+; X32-LABEL: define i32 @cmp15(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 15)
@@ -278,8 +292,9 @@ define i32 @cmp15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp16(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 16)
+; X32-LABEL: define i32 @cmp16(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16)
@@ -287,12 +302,13 @@ define i32 @cmp16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq2(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X32-LABEL: define i32 @cmp_eq2(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -303,21 +319,22 @@ define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq3(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i16 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X32-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i16
-; X32-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i16
-; X32-NEXT:    [[TMP12:%.*]] = xor i16 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i16 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i16 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-LABEL: define i32 @cmp_eq3(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X32-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X32-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X32-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -328,12 +345,13 @@ define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq4(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X32-LABEL: define i32 @cmp_eq4(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -344,21 +362,22 @@ define i32 @cmp_eq4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq5(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i32
-; X32-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
-; X32-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-LABEL: define i32 @cmp_eq5(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X32-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -369,21 +388,22 @@ define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq6(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X32-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X32-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X32-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X32-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X32-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X32-LABEL: define i32 @cmp_eq6(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X32-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -394,21 +414,22 @@ define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture readonly align 4 %y)  {
-; X32-LABEL: @cmp_eq6_align4(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 4
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 4
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 4
-; X32-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 4
-; X32-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X32-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X32-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X32-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X32-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X32-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X32-LABEL: define i32 @cmp_eq6_align4(
+; X32-SAME: ptr nocapture readonly align 4 [[X:%.*]], ptr nocapture readonly align 4 [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 4
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 4
+; X32-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 4
+; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X32-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -419,19 +440,20 @@ define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture read
 }
 
 define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq7(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-LABEL: define i32 @cmp_eq7(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X32-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X32-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X32-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -442,19 +464,20 @@ define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq8(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-LABEL: define i32 @cmp_eq8(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X32-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X32-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X32-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -465,8 +488,9 @@ define i32 @cmp_eq8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq9(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 9)
+; X32-LABEL: define i32 @cmp_eq9(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -478,8 +502,9 @@ define i32 @cmp_eq9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq10(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 10)
+; X32-LABEL: define i32 @cmp_eq10(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -491,8 +516,9 @@ define i32 @cmp_eq10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq11(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 11)
+; X32-LABEL: define i32 @cmp_eq11(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -504,8 +530,9 @@ define i32 @cmp_eq11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq12(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 12)
+; X32-LABEL: define i32 @cmp_eq12(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -517,8 +544,9 @@ define i32 @cmp_eq12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq13(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 13)
+; X32-LABEL: define i32 @cmp_eq13(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -530,8 +558,9 @@ define i32 @cmp_eq13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq14(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 14)
+; X32-LABEL: define i32 @cmp_eq14(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -543,8 +572,9 @@ define i32 @cmp_eq14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq15(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 15)
+; X32-LABEL: define i32 @cmp_eq15(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -556,8 +586,9 @@ define i32 @cmp_eq15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq16(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 16)
+; X32-LABEL: define i32 @cmp_eq16(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index f686e29975564f..99100aad3ee84a 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -1,66 +1,67 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD
-; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_2LD
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD
 ; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_2LD
 
 declare i32 @memcmp(ptr nocapture, ptr nocapture, i64)
 
 define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp2(
-; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X64-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    ret i32 [[TMP9]]
+; X64-LABEL: define i32 @cmp2(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 2)
   ret i32 %call
 }
 
 define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonly align 2 %y)  {
-; X64-LABEL: @cmp2_align2(
-; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 2
-; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; X64-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X64-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    ret i32 [[TMP9]]
+; X64-LABEL: define i32 @cmp2_align2(
+; X64-SAME: ptr nocapture readonly align 2 [[X:%.*]], ptr nocapture readonly align 2 [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 2
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 2
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 2)
   ret i32 %call
 }
 
 define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp3(
+; X64-LABEL: define i32 @cmp3(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X64-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X64-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X64-NEXT:    br label [[ENDBLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 3)
@@ -68,47 +69,49 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp4(
-; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; X64-NEXT:    ret i32 [[TMP11]]
+; X64-LABEL: define i32 @cmp4(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4)
   ret i32 %call
 }
 
 define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp5(
+; X64-LABEL: define i32 @cmp5(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP7:%.*]], [[TMP8:%.*]]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X64-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X64-NEXT:    br label [[ENDBLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 5)
@@ -116,32 +119,33 @@ define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp6(
+; X64-LABEL: define i32 @cmp6(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X64-NEXT:    [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
-; X64-NEXT:    [[TMP18]] = zext i16 [[TMP16]] to i32
-; X64-NEXT:    [[TMP19]] = zext i16 [[TMP17]] to i32
-; X64-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]]
-; X64-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i16 [[TMP12]] to i32
+; X64-NEXT:    [[TMP15]] = zext i16 [[TMP13]] to i32
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -151,30 +155,31 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp7(
+; X64-LABEL: define i32 @cmp7(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -184,47 +189,49 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp8(
-; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; X64-NEXT:    ret i32 [[TMP11]]
+; X64-LABEL: define i32 @cmp8(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8)
   ret i32 %call
 }
 
 define i32 @cmp9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp9(
+; X64-LABEL: define i32 @cmp9(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP7:%.*]], [[TMP8:%.*]]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X64-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X64-NEXT:    br label [[ENDBLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 9)
@@ -232,32 +239,33 @@ define i32 @cmp9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp10(
+; X64-LABEL: define i32 @cmp10(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X64-NEXT:    [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
-; X64-NEXT:    [[TMP18]] = zext i16 [[TMP16]] to i64
-; X64-NEXT:    [[TMP19]] = zext i16 [[TMP17]] to i64
-; X64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
-; X64-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i16 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i16 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -267,30 +275,31 @@ define i32 @cmp10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp11(
+; X64-LABEL: define i32 @cmp11(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -300,32 +309,33 @@ define i32 @cmp11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp12(
+; X64-LABEL: define i32 @cmp12(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X64-NEXT:    [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X64-NEXT:    [[TMP18]] = zext i32 [[TMP16]] to i64
-; X64-NEXT:    [[TMP19]] = zext i32 [[TMP17]] to i64
-; X64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
-; X64-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -335,30 +345,31 @@ define i32 @cmp12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp13(
+; X64-LABEL: define i32 @cmp13(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 5
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 5
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -368,30 +379,31 @@ define i32 @cmp13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp14(
+; X64-LABEL: define i32 @cmp14(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 6
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 6
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -401,30 +413,31 @@ define i32 @cmp14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp15(
+; X64-LABEL: define i32 @cmp15(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 7
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 7
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -434,30 +447,31 @@ define i32 @cmp15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp16(
+; X64-LABEL: define i32 @cmp16(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -467,12 +481,13 @@ define i32 @cmp16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp_eq2(
-; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-LABEL: define i32 @cmp_eq2(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;
@@ -483,43 +498,45 @@ define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq3(
+; X64_1LD-LABEL: define i32 @cmp_eq3(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X64_1LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
-; X64_1LD-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i8 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq3(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i16 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X64_2LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i16
-; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i16
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i16 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i16 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i16 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq3(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -530,12 +547,13 @@ define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp_eq4(
-; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-LABEL: define i32 @cmp_eq4(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;
@@ -546,43 +564,45 @@ define i32 @cmp_eq4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq5(
+; X64_1LD-LABEL: define i32 @cmp_eq5(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_1LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
-; X64_1LD-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i8 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq5(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_2LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i32
-; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq5(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -593,43 +613,45 @@ define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq6(
+; X64_1LD-LABEL: define i32 @cmp_eq6(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i16 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq6(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq6(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -640,43 +662,45 @@ define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture readonly align 4 %y)  {
-; X64_1LD-LABEL: @cmp_eq6_align4(
+; X64_1LD-LABEL: define i32 @cmp_eq6_align4(
+; X64_1LD-SAME: ptr nocapture readonly align 4 [[X:%.*]], ptr nocapture readonly align 4 [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 4
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 4
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 4
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 4
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 4
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 4
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 4
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i16 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq6_align4(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 4
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 4
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 4
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 4
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq6_align4(
+; X64_2LD-SAME: ptr nocapture readonly align 4 [[X:%.*]], ptr nocapture readonly align 4 [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 4
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 4
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 4
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -687,41 +711,43 @@ define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture read
 }
 
 define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq7(
+; X64_1LD-LABEL: define i32 @cmp_eq7(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq7(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq7(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -732,12 +758,13 @@ define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp_eq8(
-; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-LABEL: define i32 @cmp_eq8(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;
@@ -748,43 +775,45 @@ define i32 @cmp_eq8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq9(
+; X64_1LD-LABEL: define i32 @cmp_eq9(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_1LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
-; X64_1LD-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i8 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq9(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_2LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i64
-; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i64
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq9(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -795,43 +824,45 @@ define i32 @cmp_eq9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq10(
+; X64_1LD-LABEL: define i32 @cmp_eq10(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i16 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq10(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i64
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i64
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i64 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq10(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -842,41 +873,43 @@ define i32 @cmp_eq10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq11(
+; X64_1LD-LABEL: define i32 @cmp_eq11(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq11(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq11(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -887,43 +920,45 @@ define i32 @cmp_eq11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq12(
+; X64_1LD-LABEL: define i32 @cmp_eq12(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq12(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP10]] to i64
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i64 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq12(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -934,41 +969,43 @@ define i32 @cmp_eq12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq13(
+; X64_1LD-LABEL: define i32 @cmp_eq13(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 5
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 5
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq13(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 5
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 5
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq13(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -979,41 +1016,43 @@ define i32 @cmp_eq13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq14(
+; X64_1LD-LABEL: define i32 @cmp_eq14(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 6
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 6
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq14(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 6
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 6
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq14(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -1024,41 +1063,43 @@ define i32 @cmp_eq14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq15(
+; X64_1LD-LABEL: define i32 @cmp_eq15(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 7
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq15(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 7
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq15(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -1069,12 +1110,13 @@ define i32 @cmp_eq15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp_eq16(
-; X64-NEXT:    [[TMP3:%.*]] = load i128, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-LABEL: define i32 @cmp_eq16(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg b/llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg
new file mode 100644
index 00000000000000..dfb347e640e144
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'PowerPC' in config.root.targets:
+    config.unsupported = True
\ No newline at end of file
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll b/llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll
new file mode 100644
index 00000000000000..a62b17de08ee43
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -O2 -S -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+ at s1 = internal global ptr @.str, align 8
+ at s2 = internal global ptr @.str.1, align 8
+ at s3 = internal global ptr @.str.2, align 8
+ at .str = private unnamed_addr constant [9 x i8] c"01234000\00", align 1
+ at .str.1 = private unnamed_addr constant [9 x i8] c"0123!000\00", align 1
+ at .str.2 = private unnamed_addr constant [9 x i8] c"0123?000\00", align 1
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local i32 @memcmp_same_prefix_consts(ptr noundef %x) #0 {
+; CHECK-LABEL: define dso_local noundef i32 @memcmp_same_prefix_consts(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[TMP0]], 858927408
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 52
+; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP5]], [[TMP1]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP6]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[IF_END8:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP4]], 33
+; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP1]]
+; CHECK-NEXT:    [[DOTNOT3:%.*]] = icmp eq i32 [[TMP8]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT3]], label [[IF_END8]], label [[IF_THEN3:%.*]]
+; CHECK:       if.then3:
+; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP4]], 63
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP1]]
+; CHECK-NEXT:    [[DOTNOT4:%.*]] = icmp eq i32 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT4]], label [[IF_END8]], label [[RETURN:%.*]]
+; CHECK:       if.end8:
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 0, [[IF_END8]] ], [ 42, [[IF_THEN3]] ]
+; CHECK-NEXT:    ret i32 [[RETVAL_0]]
+;
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca ptr, align 8
+  store ptr %x, ptr %x.addr, align 8
+  %0 = load ptr, ptr %x.addr, align 8
+  %1 = load ptr, ptr @s1, align 8
+  %call = call i32 @memcmp(ptr noundef %0, ptr noundef %1, i64 noundef 5) #2
+  %cmp = icmp ne i32 %call, 0
+  br i1 %cmp, label %if.then, label %if.end8
+
+if.then:                                          ; preds = %entry
+  %2 = load ptr, ptr %x.addr, align 8
+  %3 = load ptr, ptr @s2, align 8
+  %call1 = call i32 @memcmp(ptr noundef %2, ptr noundef %3, i64 noundef 5) #2
+  %cmp2 = icmp ne i32 %call1, 0
+  br i1 %cmp2, label %if.then3, label %if.end7
+
+if.then3:                                         ; preds = %if.then
+  %4 = load ptr, ptr %x.addr, align 8
+  %5 = load ptr, ptr @s3, align 8
+  %call4 = call i32 @memcmp(ptr noundef %4, ptr noundef %5, i64 noundef 5) #2
+  %cmp5 = icmp ne i32 %call4, 0
+  br i1 %cmp5, label %if.then6, label %if.end
+
+if.then6:                                         ; preds = %if.then3
+  store i32 42, ptr %retval, align 4
+  br label %return
+
+if.end:                                           ; preds = %if.then3
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.end, %if.then
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.end7, %entry
+  store i32 0, ptr %retval, align 4
+  br label %return
+
+return:                                           ; preds = %if.end8, %if.then6
+  %6 = load i32, ptr %retval, align 4
+  ret i32 %6
+}
+
+; Function Attrs: nounwind willreturn memory(read)
+declare i32 @memcmp(ptr noundef, ptr noundef, i64 noundef) #1
+
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/memcmp-mergeexpand.ll b/llvm/test/Transforms/PhaseOrdering/X86/memcmp-mergeexpand.ll
new file mode 100644
index 00000000000000..2de1f8576f631f
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/memcmp-mergeexpand.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S --passes=mergeicmps,expand-memcmp -mtriple=i686-unknown-linux < %s | FileCheck %s --check-prefix=X86
+; RUN: opt -S --passes=mergeicmps,expand-memcmp -mtriple=x86_64-unknown-linux < %s | FileCheck %s --check-prefix=X64
+
+; This tests interaction between MergeICmp and ExpandMemCmp.
+
+%"struct.std::pair" = type { i32, i32 }
+
+define zeroext i1 @opeq1(
+; X86-LABEL: define zeroext i1 @opeq1(
+; X86-SAME: ptr nocapture readonly dereferenceable(8) [[A:%.*]], ptr nocapture readonly dereferenceable(8) [[B:%.*]]) local_unnamed_addr {
+; X86-NEXT:  "entry+land.rhs.i":
+; X86-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 1
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP0]], [[TMP1]]
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    [[TMP8:%.*]] = or i32 [[TMP2]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+; X86-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X86-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
+; X86-NEXT:    br label [[OPEQ1_EXIT:%.*]]
+; X86:       opeq1.exit:
+; X86-NEXT:    ret i1 [[TMP11]]
+;
+; X64-LABEL: define zeroext i1 @opeq1(
+; X64-SAME: ptr nocapture readonly dereferenceable(8) [[A:%.*]], ptr nocapture readonly dereferenceable(8) [[B:%.*]]) local_unnamed_addr {
+; X64-NEXT:  "entry+land.rhs.i":
+; X64-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A]], align 1
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[B]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP0]], [[TMP1]]
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    br label [[OPEQ1_EXIT:%.*]]
+; X64:       opeq1.exit:
+; X64-NEXT:    ret i1 [[TMP4]]
+;
+  %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
+  %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
+entry:
+  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+  %0 = load i32, i32* %first.i, align 4
+  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+  %1 = load i32, i32* %first1.i, align 4
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+  %2 = load i32, i32* %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+  %3 = load i32, i32* %second2.i, align 4
+  %cmp3.i = icmp eq i32 %2, %3
+  br label %opeq1.exit
+
+opeq1.exit:
+  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
+  ret i1 %4
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll b/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll
new file mode 100644
index 00000000000000..68dfacac5b5e12
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll
@@ -0,0 +1,856 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt  -O2 -S -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+ at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare i32 @memcmp(ptr, ptr, i64)
+
+declare i32 @bcmp(ptr, ptr, i64)
+
+; Function Attrs: nounwind
+define i32 @length0(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define noundef i32 @length0(
+; CHECK-SAME: ptr nocapture readnone [[X:%.*]], ptr nocapture readnone [[Y:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    ret i32 0
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length0_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define noundef i1 @length0_eq(
+; CHECK-SAME: ptr nocapture readnone [[X:%.*]], ptr nocapture readnone [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    ret i1 true
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length0_lt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define noundef i1 @length0_lt(
+; CHECK-SAME: ptr nocapture readnone [[X:%.*]], ptr nocapture readnone [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    ret i1 false
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) #0
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length2(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length2(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length2_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length2_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length2_lt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length2_lt(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length2_gt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length2_gt(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt i16 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length2_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length2_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; CHECK-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #1
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length3(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length3(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:  loadbb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i16 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i16 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    br label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP13]], [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length3_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length3_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i8 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i16
+; CHECK-NEXT:    [[TMP10:%.*]] = or i16 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i16 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length4(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length4(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[DOTNEG:%.*]] = sext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i32 [[DOTNEG]], [[TMP7]]
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length4_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length4_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length4_lt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length4_lt(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) #0
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length4_gt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length4_gt(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) #0
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length4_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length4_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP1]], 875770417
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length5(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length5(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:  loadbb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    br label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP13]], [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length5_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length5_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i8 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length5_lt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length5_lt(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:  loadbb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult i8 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    br label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i1 [ [[TMP10]], [[LOADBB1]] ], [ [[TMP5]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i1 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) #0
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length7_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length7_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length8(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length8(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[DOTNEG:%.*]] = sext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i32 [[DOTNEG]], [[TMP7]]
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length8_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length8_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length8_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length8_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length9_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length9_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i8 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length10_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length10_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i16 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length11_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length11_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length12_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length12_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length12(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length12(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:  loadbb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP2]], [[LOADBB:%.*]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP3]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length13_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length13_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length14_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length14_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @length15_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length15_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length16(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length16(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:  loadbb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP2]], [[LOADBB:%.*]] ], [ [[TMP11:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP3]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11]] = tail call i64 @llvm.bswap.i64(i64 [[TMP9]])
+; CHECK-NEXT:    [[TMP12]] = tail call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length16_eq(ptr %x, ptr %y) #0 {
+; CHECK-LABEL: define i1 @length16_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) #0
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; Function Attrs: nounwind
+define i1 @length16_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length16_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i128 [[TMP1]], 70720121592765328381466889075544961328
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length24(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length24(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(24) [[X]], ptr noundef nonnull dereferenceable(24) [[Y]], i64 24) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length24_eq(ptr %x, ptr %y) #0 {
+; CHECK-LABEL: define i1 @length24_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP8]] to i128
+; CHECK-NEXT:    [[TMP10:%.*]] = or i128 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i128 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) #0
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; Function Attrs: nounwind
+define i1 @length24_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length24_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3689065127958034230
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i64 [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    ret i1 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length32(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length32(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(32) [[X]], ptr noundef nonnull dereferenceable(32) [[Y]], i64 32) #[[ATTR5]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length32_eq(ptr %x, ptr %y) #0 {
+; CHECK-LABEL: define i1 @length32_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP5:%.*]] = load i128, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i128 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) #0
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; Function Attrs: nounwind
+define i1 @length32_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length32_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i128 [[TMP3]], 65382562593882267225249597816672106294
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret i1 [[TMP6]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length64(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length64(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) [[Y]], i64 64) #[[ATTR5]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length64_eq(ptr %x, ptr %y) #0 {
+; CHECK-LABEL: define i1 @length64_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) [[Y]], i64 64) #[[ATTR5]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) #0
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; Function Attrs: nounwind
+define i1 @length64_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length64_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) @.str, i64 64) #[[ATTR5]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @huge_length(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @huge_length(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(9223372036854775807) [[X]], ptr noundef nonnull dereferenceable(9223372036854775807) [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @huge_length_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @huge_length_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(9223372036854775807) [[X]], ptr noundef nonnull dereferenceable(9223372036854775807) [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) #0 {
+; CHECK-LABEL: define i32 @nonconst_length(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) #0 {
+; CHECK-LABEL: define i1 @nonconst_length_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @bcmp_length2(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @bcmp_length2(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i64 2) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nobuiltin nounwind }
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index b6068513d23063..3e3a40f217e63e 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -422,7 +422,6 @@ int main(int argc, char **argv) {
   // supported.
   initializeExpandLargeDivRemLegacyPassPass(Registry);
   initializeExpandLargeFpConvertLegacyPassPass(Registry);
-  initializeExpandMemCmpLegacyPassPass(Registry);
   initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
   initializeSelectOptimizePass(Registry);
   initializeCallBrPreparePass(Registry);
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
index 047f6583ec4e88..d618bd5bfab9aa 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
@@ -71,7 +71,6 @@ static_library("CodeGen") {
     "ExecutionDomainFix.cpp",
     "ExpandLargeDivRem.cpp",
     "ExpandLargeFpConvert.cpp",
-    "ExpandMemCmp.cpp",
     "ExpandPostRAPseudos.cpp",
     "ExpandReductions.cpp",
     "ExpandVectorPredication.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn
index bed26df94e2c45..876f5fece1128e 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn
@@ -23,6 +23,7 @@ static_library("Scalar") {
     "DeadStoreElimination.cpp",
     "DivRemPairs.cpp",
     "EarlyCSE.cpp",
+    "ExpandMemCmp.cpp",
     "FlattenCFGPass.cpp",
     "Float2Int.cpp",
     "GVN.cpp",



More information about the llvm-commits mailing list