[llvm] [ExpandMemCmp] Optimize ExpandMemCmp to reduce instruction count on x86 (PR #69609)

Igor Kirillov via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 19 09:01:30 PDT 2023


https://github.com/igogo-x86 updated https://github.com/llvm/llvm-project/pull/69609

>From 67917cd9a679e7d0247b790d01cf8d13acef12bd Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Thu, 19 Oct 2023 15:12:45 +0000
Subject: [PATCH 1/2] [ExpandMemCmp] Optimize ExpandMemCmp to reduce
 instruction count on x86

Refactored the sequence of operations in MemCmpExpansion to zero-extend
before byte-swapping. This change enables the generation of fewer instructions
for x86, thereby improving code efficiency.
---
 llvm/lib/CodeGen/ExpandMemCmp.cpp             |   31 +-
 .../CodeGen/X86/memcmp-more-load-pairs-x32.ll |   26 +-
 .../CodeGen/X86/memcmp-more-load-pairs.ll     |   22 +-
 llvm/test/CodeGen/X86/memcmp-optsize-x32.ll   |   10 +-
 llvm/test/CodeGen/X86/memcmp-optsize.ll       |   10 +-
 llvm/test/CodeGen/X86/memcmp-pgso-x32.ll      |   10 +-
 llvm/test/CodeGen/X86/memcmp-pgso.ll          |   10 +-
 llvm/test/CodeGen/X86/memcmp-x32.ll           |   36 +-
 llvm/test/CodeGen/X86/memcmp.ll               |   32 +-
 .../Transforms/ExpandMemCmp/X86/memcmp-x32.ll |  392 +++---
 .../Transforms/ExpandMemCmp/X86/memcmp.ll     | 1058 ++++++++---------
 11 files changed, 801 insertions(+), 836 deletions(-)

diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 911ebd41afc5b91..40fbe877ab7ac09 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -307,19 +307,20 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
   if (!Rhs)
     Rhs = Builder.CreateAlignedLoad(LoadSizeType, RhsSource, RhsAlign);
 
+  // Zero extend if required.
+  if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
+    Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
+    Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
+  }
+
   // Swap bytes if required.
   if (NeedsBSwap) {
+    Type *BSwapType = CmpSizeType ? CmpSizeType : LoadSizeType;
     Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
-                                                Intrinsic::bswap, LoadSizeType);
+                                                Intrinsic::bswap, BSwapType);
     Lhs = Builder.CreateCall(Bswap, Lhs);
     Rhs = Builder.CreateCall(Bswap, Rhs);
   }
-
-  // Zero extend if required.
-  if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
-    Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
-    Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
-  }
   return {Lhs, Rhs};
 }
 
@@ -694,10 +695,10 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
 ///  %17 = getelementptr i32, i32* %15, i32 2
 ///  %18 = load i32, i32* %16
 ///  %19 = load i32, i32* %17
-///  %20 = call i32 @llvm.bswap.i32(i32 %18)
-///  %21 = call i32 @llvm.bswap.i32(i32 %19)
-///  %22 = zext i32 %20 to i64
-///  %23 = zext i32 %21 to i64
+///  %20 = zext i32 %18 to i64
+///  %21 = zext i32 %19 to i64
+///  %22 = call i64 @llvm.bswap.i64(i64 %20)
+///  %23 = call i64 @llvm.bswap.i64(i64 %21)
 ///  %24 = sub i64 %22, %23
 ///  %25 = icmp ne i64 %24, 0
 ///  br i1 %25, label %res_block, label %loadbb2
@@ -710,10 +711,10 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
 ///  %31 = getelementptr i16, i16* %29, i16 6
 ///  %32 = load i16, i16* %30
 ///  %33 = load i16, i16* %31
-///  %34 = call i16 @llvm.bswap.i16(i16 %32)
-///  %35 = call i16 @llvm.bswap.i16(i16 %33)
-///  %36 = zext i16 %34 to i64
-///  %37 = zext i16 %35 to i64
+///  %34 = zext i16 %32 to i64
+///  %35 = zext i16 %33 to i64
+///  %36 = call i64 @llvm.bswap.i64(i16 %34)
+///  %37 = call i64 @llvm.bswap.i64(i16 %35)
 ///  %38 = sub i64 %36, %37
 ///  %39 = icmp ne i64 %38, 0
 ///  br i1 %39, label %res_block, label %loadbb3
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
index c0f8f86e6e8b107..a89571656e46951 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
@@ -44,14 +44,12 @@ define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
 define i32 @length2(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -75,14 +73,12 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
 define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_lt:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -99,10 +95,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl (%ecx), %ecx
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    setg %al
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
index 56d06021867fa15..1f07ba39ecef9fe 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -52,10 +52,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -79,10 +77,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
@@ -97,10 +93,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    testl %eax, %eax
 ; X64-NEXT:    setg %al
@@ -511,8 +505,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    je .LBB29_3
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
index 762691151f4bd3b..8efd4fca91a9972 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
@@ -13,14 +13,12 @@ declare dso_local i32 @bcmp(ptr, ptr, i32)
 define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
index c0c7b98d471cd46..a8df0ac1354f893 100644
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll
@@ -16,10 +16,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -251,8 +249,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    je .LBB15_3
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
index cb45fd3ebb9068c..b486eebd54b4a37 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
@@ -13,14 +13,12 @@ declare dso_local i32 @bcmp(ptr, ptr, i32)
 define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
index 720344a22e43b5c..afb57c8101b8221 100644
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -16,10 +16,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -251,8 +249,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    je .LBB15_3
diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll
index ab439b32f2f1b20..f5b67ab45f7255c 100644
--- a/llvm/test/CodeGen/X86/memcmp-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-x32.ll
@@ -43,14 +43,12 @@ define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
 define i32 @length2(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
@@ -62,9 +60,8 @@ define i32 @length2_const(ptr %X, ptr %Y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    addl $-12594, %eax # imm = 0xCECE
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    addl $-825360384, %eax # imm = 0xCECE0000
 ; X86-NEXT:    retl
   %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
   ret i32 %m
@@ -75,9 +72,8 @@ define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    addl $-12594, %eax # imm = 0xCECE
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    addl $-825360384, %eax # imm = 0xCECE0000
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setg %al
 ; X86-NEXT:    retl
@@ -103,14 +99,12 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
 define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
 ; X86-LABEL: length2_lt:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
 ; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    bswapl %ecx
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    shrl $31, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -127,10 +121,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl (%ecx), %ecx
 ; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    setg %al
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
index 1330f3a241a5c2a..b8c0f509f1d081b 100644
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -51,10 +51,8 @@ define i32 @length2(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -65,9 +63,8 @@ define i32 @length2_const(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length2_const:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    addl $-12594, %eax # imm = 0xCECE
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    addl $-825360384, %eax # imm = 0xCECE0000
 ; X64-NEXT:    retq
   %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
   ret i32 %m
@@ -77,9 +74,8 @@ define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
 ; X64-LABEL: length2_gt_const:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    addl $-12594, %eax # imm = 0xCECE
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    addl $-825360384, %eax # imm = 0xCECE0000
 ; X64-NEXT:    testl %eax, %eax
 ; X64-NEXT:    setg %al
 ; X64-NEXT:    retq
@@ -105,10 +101,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    shrl $31, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
@@ -123,10 +117,8 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
 ; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    testl %eax, %eax
 ; X64-NEXT:    setg %al
@@ -537,8 +529,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
 ; X64-NEXT:  # %bb.1: # %loadbb1
 ; X64-NEXT:    movl 8(%rdi), %ecx
 ; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpq %rdx, %rcx
 ; X64-NEXT:    je .LBB31_3
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
index f56d9688a01e12d..bd42d5f8d50859d 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
@@ -5,14 +5,14 @@ declare i32 @memcmp(ptr nocapture, ptr nocapture, i32)
 
 define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp2(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X32-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    ret i32 [[TMP9]]
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
+; X32-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
+; X32-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 2)
   ret i32 %call
@@ -20,14 +20,14 @@ define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonly align 2 %y)  {
 ; X32-LABEL: @cmp2_align2(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 2
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; X32-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X32-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    ret i32 [[TMP9]]
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 2
+; X32-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
+; X32-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
+; X32-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 2)
   ret i32 %call
@@ -37,27 +37,27 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp3(
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
+; X32-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X32-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X32-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X32-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X32-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X32-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X32-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X32-NEXT:    br label [[ENDBLOCK]]
 ; X32:       endblock:
-; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 3)
@@ -66,16 +66,16 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp4(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X32-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X32-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; X32-NEXT:    ret i32 [[TMP11]]
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X32-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X32-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X32-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X32-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X32-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X32-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 4)
   ret i32 %call
@@ -85,27 +85,27 @@ define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp5(
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP7:%.*]], [[TMP8:%.*]]
+; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X32-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X32-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X32-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X32-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X32-NEXT:    br label [[ENDBLOCK]]
 ; X32:       endblock:
-; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 5)
@@ -116,29 +116,29 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp6(
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X32-NEXT:    [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
-; X32-NEXT:    [[TMP18]] = zext i16 [[TMP16]] to i32
-; X32-NEXT:    [[TMP19]] = zext i16 [[TMP17]] to i32
-; X32-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]]
-; X32-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
+; X32-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
+; X32-NEXT:    [[TMP14]] = call i32 @llvm.bswap.i32(i32 [[TMP12]])
+; X32-NEXT:    [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
+; X32-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], [[TMP15]]
+; X32-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X32:       endblock:
 ; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
@@ -151,27 +151,27 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp7(
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X32-NEXT:    [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X32-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
-; X32-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X32-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X32-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X32-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X32:       endblock:
 ; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
@@ -184,27 +184,27 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp8(
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X32-NEXT:    [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X32-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
-; X32-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X32-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X32-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X32-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X32:       endblock:
 ; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
@@ -287,11 +287,11 @@ define i32 @cmp16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp_eq2(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -303,20 +303,20 @@ define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp_eq3(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i16 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X32-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i16
-; X32-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i16
-; X32-NEXT:    [[TMP12:%.*]] = xor i16 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i16 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i16 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X32-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X32-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X32-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -328,11 +328,11 @@ define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp_eq4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp_eq4(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -344,20 +344,20 @@ define i32 @cmp_eq4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp_eq5(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i32
-; X32-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
-; X32-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X32-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -369,20 +369,20 @@ define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp_eq6(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X32-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X32-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X32-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X32-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X32-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X32-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -394,20 +394,20 @@ define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture readonly align 4 %y)  {
 ; X32-LABEL: @cmp_eq6_align4(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 4
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 4
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 4
-; X32-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 4
-; X32-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X32-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X32-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X32-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X32-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X32-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 4
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 4
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 4
+; X32-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 4
+; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X32-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -419,18 +419,18 @@ define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture read
 
 define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp_eq7(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X32-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X32-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X32-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -442,18 +442,18 @@ define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp_eq8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X32-LABEL: @cmp_eq8(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X32-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X32-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X32-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index 2594f53971393d4..57a242a77509aa6 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -6,14 +6,14 @@ declare i32 @memcmp(ptr nocapture, ptr nocapture, i64)
 
 define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp2(
-; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X64-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    ret i32 [[TMP9]]
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
+; X64-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
+; X64-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 2)
   ret i32 %call
@@ -21,14 +21,14 @@ define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonly align 2 %y)  {
 ; X64-LABEL: @cmp2_align2(
-; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 2
-; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; X64-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X64-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    ret i32 [[TMP9]]
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 2
+; X64-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
+; X64-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
+; X64-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 2)
   ret i32 %call
@@ -38,27 +38,27 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp3(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X64-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X64-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X64-NEXT:    br label [[ENDBLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 3)
@@ -67,16 +67,16 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp4(
-; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; X64-NEXT:    ret i32 [[TMP11]]
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4)
   ret i32 %call
@@ -86,27 +86,27 @@ define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp5(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP7:%.*]], [[TMP8:%.*]]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X64-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X64-NEXT:    br label [[ENDBLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 5)
@@ -117,29 +117,29 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp6(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X64-NEXT:    [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
-; X64-NEXT:    [[TMP18]] = zext i16 [[TMP16]] to i32
-; X64-NEXT:    [[TMP19]] = zext i16 [[TMP17]] to i32
-; X64-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]]
-; X64-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14]] = call i32 @llvm.bswap.i32(i32 [[TMP12]])
+; X64-NEXT:    [[TMP15]] = call i32 @llvm.bswap.i32(i32 [[TMP13]])
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -152,27 +152,27 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp7(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -183,16 +183,16 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp8(
-; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; X64-NEXT:    ret i32 [[TMP11]]
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8)
   ret i32 %call
@@ -202,27 +202,27 @@ define i32 @cmp9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp9(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP7:%.*]], [[TMP8:%.*]]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X64-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X64-NEXT:    br label [[ENDBLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 9)
@@ -233,29 +233,29 @@ define i32 @cmp10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp10(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X64-NEXT:    [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
-; X64-NEXT:    [[TMP18]] = zext i16 [[TMP16]] to i64
-; X64-NEXT:    [[TMP19]] = zext i16 [[TMP17]] to i64
-; X64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
-; X64-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i64
+; X64-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i64
+; X64-NEXT:    [[TMP14]] = call i64 @llvm.bswap.i64(i64 [[TMP12]])
+; X64-NEXT:    [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]])
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -268,27 +268,27 @@ define i32 @cmp11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp11(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -301,29 +301,29 @@ define i32 @cmp12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp12(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X64-NEXT:    [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X64-NEXT:    [[TMP18]] = zext i32 [[TMP16]] to i64
-; X64-NEXT:    [[TMP19]] = zext i32 [[TMP17]] to i64
-; X64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
-; X64-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP10]] to i64
+; X64-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
+; X64-NEXT:    [[TMP14]] = call i64 @llvm.bswap.i64(i64 [[TMP12]])
+; X64-NEXT:    [[TMP15]] = call i64 @llvm.bswap.i64(i64 [[TMP13]])
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -336,27 +336,27 @@ define i32 @cmp13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp13(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 5
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 5
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -369,27 +369,27 @@ define i32 @cmp14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp14(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 6
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 6
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -402,27 +402,27 @@ define i32 @cmp15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp15(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 7
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 7
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -435,27 +435,27 @@ define i32 @cmp16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp16(
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -466,11 +466,11 @@ define i32 @cmp16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp_eq2(
-; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;
@@ -486,17 +486,17 @@ define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X64_1LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
-; X64_1LD-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i8 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -504,20 +504,20 @@ define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq3(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i16 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X64_2LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i16
-; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i16
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i16 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i16 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i16 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -529,11 +529,11 @@ define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp_eq4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp_eq4(
-; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;
@@ -549,17 +549,17 @@ define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_1LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
-; X64_1LD-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i8 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -567,20 +567,20 @@ define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq5(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_2LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i32
-; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -596,17 +596,17 @@ define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i16 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -614,20 +614,20 @@ define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq6(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -643,17 +643,17 @@ define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture read
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 4
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 4
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 4
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 4
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 4
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 4
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 4
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 4
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i16 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -661,20 +661,20 @@ define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture read
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq6_align4(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 4
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 4
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 4
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 4
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 4
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 4
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 4
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 4
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -690,17 +690,17 @@ define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -708,18 +708,18 @@ define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq7(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -731,11 +731,11 @@ define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp_eq8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp_eq8(
-; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;
@@ -751,17 +751,17 @@ define i32 @cmp_eq9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_1LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
-; X64_1LD-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i8 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -769,20 +769,20 @@ define i32 @cmp_eq9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq9(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_2LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i64
-; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i64
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -798,17 +798,17 @@ define i32 @cmp_eq10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i16 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -816,20 +816,20 @@ define i32 @cmp_eq10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq10(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i64
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i64
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i64 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -845,17 +845,17 @@ define i32 @cmp_eq11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -863,18 +863,18 @@ define i32 @cmp_eq11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq11(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -890,17 +890,17 @@ define i32 @cmp_eq12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -908,20 +908,20 @@ define i32 @cmp_eq12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq12(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP10]] to i64
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i64 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -937,17 +937,17 @@ define i32 @cmp_eq13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 5
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 5
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -955,18 +955,18 @@ define i32 @cmp_eq13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq13(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 5
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 5
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -982,17 +982,17 @@ define i32 @cmp_eq14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 6
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 6
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -1000,18 +1000,18 @@ define i32 @cmp_eq14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq14(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 6
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 6
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -1027,17 +1027,17 @@ define i32 @cmp_eq15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 7
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
@@ -1045,18 +1045,18 @@ define i32 @cmp_eq15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
 ; X64_2LD-LABEL: @cmp_eq15(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 7
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y:%.*]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -1068,11 +1068,11 @@ define i32 @cmp_eq15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 
 define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 ; X64-LABEL: @cmp_eq16(
-; X64-NEXT:    [[TMP3:%.*]] = load i128, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X:%.*]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y:%.*]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;

>From a6eeb1232a422fa2cba042a507d7fdadd4557e5d Mon Sep 17 00:00:00 2001
From: Igor Kirillov <igor.kirillov at arm.com>
Date: Thu, 19 Oct 2023 16:01:10 +0000
Subject: [PATCH 2/2] Clang-format

---
 llvm/lib/CodeGen/ExpandMemCmp.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 40fbe877ab7ac09..22f8248f0185166 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -316,8 +316,8 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
   // Swap bytes if required.
   if (NeedsBSwap) {
     Type *BSwapType = CmpSizeType ? CmpSizeType : LoadSizeType;
-    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
-                                                Intrinsic::bswap, BSwapType);
+    Function *Bswap =
+        Intrinsic::getDeclaration(CI->getModule(), Intrinsic::bswap, BSwapType);
     Lhs = Builder.CreateCall(Bswap, Lhs);
     Rhs = Builder.CreateCall(Bswap, Rhs);
   }



More information about the llvm-commits mailing list