[llvm] 54c2005 - [X86] MergeConsecutiveStores.ll - add 32-bit i686 coverage
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 27 03:35:33 PDT 2023
Author: Simon Pilgrim
Date: 2023-08-27T11:35:16+01:00
New Revision: 54c20055dad8a8db836545a25dd8359ce89b7146
URL: https://github.com/llvm/llvm-project/commit/54c20055dad8a8db836545a25dd8359ce89b7146
DIFF: https://github.com/llvm/llvm-project/commit/54c20055dad8a8db836545a25dd8359ce89b7146.diff
LOG: [X86] MergeConsecutiveStores.ll - add 32-bit i686 coverage
Added:
Modified:
llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
Removed:
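
Note: the CHECK lines in this test are autogenerated, so after adding the new i686 RUN lines the assertions are refreshed rather than hand-edited. A minimal sketch of how that regeneration is typically done, assuming llc has been built locally at build/bin/llc and the command is run from the llvm-project root:

  python3 llvm/utils/update_llc_test_checks.py \
      --llc-binary=build/bin/llc \
      llvm/test/CodeGen/X86/MergeConsecutiveStores.ll

The script re-runs each RUN line and rewrites the per-prefix CHECK blocks, which is why the diff below replaces the old CHECK/BWON/BWOFF prefixes with separate X86/X64 (and BWON/BWOFF) variants.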
################################################################################
diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
index b820023c961aa1..ecf38980573d64 100644
--- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefixes=X86,X86-BWON %s
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefixes=X86,X86-BWOFF %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefixes=X64,X64-BWON %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefixes=X64,X64-BWOFF %s
%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
@@ -8,21 +10,39 @@
; save 1,2,3 ... as one big integer.
define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
-; CHECK-LABEL: merge_const_store:
-; CHECK: # %bb.0:
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jle .LBB0_3
-; CHECK-NEXT: # %bb.1: # %.lr.ph.preheader
-; CHECK-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB0_2: # %.lr.ph
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq %rax, (%rsi)
-; CHECK-NEXT: addq $8, %rsi
-; CHECK-NEXT: decl %edi
-; CHECK-NEXT: jne .LBB0_2
-; CHECK-NEXT: .LBB0_3: # %._crit_edge
-; CHECK-NEXT: retq
+; X86-LABEL: merge_const_store:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: jle .LBB0_3
+; X86-NEXT: # %bb.1: # %.lr.ph.preheader
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB0_2: # %.lr.ph
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl $67305985, (%ecx) # imm = 0x4030201
+; X86-NEXT: movl $134678021, 4(%ecx) # imm = 0x8070605
+; X86-NEXT: addl $8, %ecx
+; X86-NEXT: decl %eax
+; X86-NEXT: jne .LBB0_2
+; X86-NEXT: .LBB0_3: # %._crit_edge
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_const_store:
+; X64: # %bb.0:
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: jle .LBB0_3
+; X64-NEXT: # %bb.1: # %.lr.ph.preheader
+; X64-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB0_2: # %.lr.ph
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: movq %rax, (%rsi)
+; X64-NEXT: addq $8, %rsi
+; X64-NEXT: decl %edi
+; X64-NEXT: jne .LBB0_2
+; X64-NEXT: .LBB0_3: # %._crit_edge
+; X64-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
@@ -54,22 +74,46 @@ define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwt
; No vectors because we use noimplicitfloat
define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat{
-; CHECK-LABEL: merge_const_store_no_vec:
-; CHECK: # %bb.0:
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jle .LBB1_2
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB1_1: # %.lr.ph
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq $0, (%rsi)
-; CHECK-NEXT: movq $0, 8(%rsi)
-; CHECK-NEXT: movq $0, 16(%rsi)
-; CHECK-NEXT: movq $0, 24(%rsi)
-; CHECK-NEXT: addq $32, %rsi
-; CHECK-NEXT: decl %edi
-; CHECK-NEXT: jne .LBB1_1
-; CHECK-NEXT: .LBB1_2: # %._crit_edge
-; CHECK-NEXT: retq
+; X86-LABEL: merge_const_store_no_vec:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: jle .LBB1_3
+; X86-NEXT: # %bb.1: # %.lr.ph.preheader
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB1_2: # %.lr.ph
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl $0, (%ecx)
+; X86-NEXT: movl $0, 4(%ecx)
+; X86-NEXT: movl $0, 8(%ecx)
+; X86-NEXT: movl $0, 12(%ecx)
+; X86-NEXT: movl $0, 16(%ecx)
+; X86-NEXT: movl $0, 20(%ecx)
+; X86-NEXT: movl $0, 24(%ecx)
+; X86-NEXT: movl $0, 28(%ecx)
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: decl %eax
+; X86-NEXT: jne .LBB1_2
+; X86-NEXT: .LBB1_3: # %._crit_edge
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_const_store_no_vec:
+; X64: # %bb.0:
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: jle .LBB1_2
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB1_1: # %.lr.ph
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: movq $0, (%rsi)
+; X64-NEXT: movq $0, 8(%rsi)
+; X64-NEXT: movq $0, 16(%rsi)
+; X64-NEXT: movq $0, 24(%rsi)
+; X64-NEXT: addq $32, %rsi
+; X64-NEXT: decl %edi
+; X64-NEXT: jne .LBB1_1
+; X64-NEXT: .LBB1_2: # %._crit_edge
+; X64-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
@@ -101,22 +145,41 @@ define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimp
; Move the constants using a single vector store.
define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
-; CHECK-LABEL: merge_const_store_vec:
-; CHECK: # %bb.0:
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jle .LBB2_3
-; CHECK-NEXT: # %bb.1: # %.lr.ph.preheader
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB2_2: # %.lr.ph
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups %ymm0, (%rsi)
-; CHECK-NEXT: addq $32, %rsi
-; CHECK-NEXT: decl %edi
-; CHECK-NEXT: jne .LBB2_2
-; CHECK-NEXT: .LBB2_3: # %._crit_edge
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X86-LABEL: merge_const_store_vec:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: jle .LBB2_3
+; X86-NEXT: # %bb.1: # %.lr.ph.preheader
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB2_2: # %.lr.ph
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: vmovups %ymm0, (%ecx)
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: decl %eax
+; X86-NEXT: jne .LBB2_2
+; X86-NEXT: .LBB2_3: # %._crit_edge
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_const_store_vec:
+; X64: # %bb.0:
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: jle .LBB2_3
+; X64-NEXT: # %bb.1: # %.lr.ph.preheader
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB2_2: # %.lr.ph
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: vmovups %ymm0, (%rsi)
+; X64-NEXT: addq $32, %rsi
+; X64-NEXT: decl %edi
+; X64-NEXT: jne .LBB2_2
+; X64-NEXT: .LBB2_3: # %._crit_edge
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
@@ -148,22 +211,64 @@ define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind
; Move the first 4 constants as a single vector. Move the rest as scalars.
define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
-; CHECK-LABEL: merge_nonconst_store:
-; CHECK: # %bb.0:
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jle .LBB3_2
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB3_1: # %.lr.ph
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl $67305985, (%rdx) # imm = 0x4030201
-; CHECK-NEXT: movb %sil, 4(%rdx)
-; CHECK-NEXT: movw $1798, 5(%rdx) # imm = 0x706
-; CHECK-NEXT: movb $8, 7(%rdx)
-; CHECK-NEXT: addq $8, %rdx
-; CHECK-NEXT: decl %edi
-; CHECK-NEXT: jne .LBB3_1
-; CHECK-NEXT: .LBB3_2: # %._crit_edge
-; CHECK-NEXT: retq
+; X86-BWON-LABEL: merge_nonconst_store:
+; X86-BWON: # %bb.0:
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BWON-NEXT: testl %eax, %eax
+; X86-BWON-NEXT: jle .LBB3_3
+; X86-BWON-NEXT: # %bb.1: # %.lr.ph.preheader
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BWON-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-BWON-NEXT: .p2align 4, 0x90
+; X86-BWON-NEXT: .LBB3_2: # %.lr.ph
+; X86-BWON-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-BWON-NEXT: movl $67305985, (%ecx) # imm = 0x4030201
+; X86-BWON-NEXT: movb %dl, 4(%ecx)
+; X86-BWON-NEXT: movw $1798, 5(%ecx) # imm = 0x706
+; X86-BWON-NEXT: movb $8, 7(%ecx)
+; X86-BWON-NEXT: addl $8, %ecx
+; X86-BWON-NEXT: decl %eax
+; X86-BWON-NEXT: jne .LBB3_2
+; X86-BWON-NEXT: .LBB3_3: # %._crit_edge
+; X86-BWON-NEXT: retl
+;
+; X86-BWOFF-LABEL: merge_nonconst_store:
+; X86-BWOFF: # %bb.0:
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BWOFF-NEXT: testl %eax, %eax
+; X86-BWOFF-NEXT: jle .LBB3_3
+; X86-BWOFF-NEXT: # %bb.1: # %.lr.ph.preheader
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BWOFF-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-BWOFF-NEXT: .p2align 4, 0x90
+; X86-BWOFF-NEXT: .LBB3_2: # %.lr.ph
+; X86-BWOFF-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-BWOFF-NEXT: movl $67305985, (%ecx) # imm = 0x4030201
+; X86-BWOFF-NEXT: movb %dl, 4(%ecx)
+; X86-BWOFF-NEXT: movw $1798, 5(%ecx) # imm = 0x706
+; X86-BWOFF-NEXT: movb $8, 7(%ecx)
+; X86-BWOFF-NEXT: addl $8, %ecx
+; X86-BWOFF-NEXT: decl %eax
+; X86-BWOFF-NEXT: jne .LBB3_2
+; X86-BWOFF-NEXT: .LBB3_3: # %._crit_edge
+; X86-BWOFF-NEXT: retl
+;
+; X64-LABEL: merge_nonconst_store:
+; X64: # %bb.0:
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: jle .LBB3_2
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB3_1: # %.lr.ph
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: movl $67305985, (%rdx) # imm = 0x4030201
+; X64-NEXT: movb %sil, 4(%rdx)
+; X64-NEXT: movw $1798, 5(%rdx) # imm = 0x706
+; X64-NEXT: movb $8, 7(%rdx)
+; X64-NEXT: addq $8, %rdx
+; X64-NEXT: decl %edi
+; X64-NEXT: jne .LBB3_1
+; X64-NEXT: .LBB3_2: # %._crit_edge
+; X64-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
@@ -194,33 +299,79 @@ define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) n
}
define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
-; BWON-LABEL: merge_loads_i16:
-; BWON: # %bb.0:
-; BWON-NEXT: testl %edi, %edi
-; BWON-NEXT: jle .LBB4_2
-; BWON-NEXT: .p2align 4, 0x90
-; BWON-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
-; BWON-NEXT: movzwl (%rsi), %eax
-; BWON-NEXT: movw %ax, (%rdx)
-; BWON-NEXT: addq $8, %rdx
-; BWON-NEXT: decl %edi
-; BWON-NEXT: jne .LBB4_1
-; BWON-NEXT: .LBB4_2: # %._crit_edge
-; BWON-NEXT: retq
-;
-; BWOFF-LABEL: merge_loads_i16:
-; BWOFF: # %bb.0:
-; BWOFF-NEXT: testl %edi, %edi
-; BWOFF-NEXT: jle .LBB4_2
-; BWOFF-NEXT: .p2align 4, 0x90
-; BWOFF-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
-; BWOFF-NEXT: movw (%rsi), %ax
-; BWOFF-NEXT: movw %ax, (%rdx)
-; BWOFF-NEXT: addq $8, %rdx
-; BWOFF-NEXT: decl %edi
-; BWOFF-NEXT: jne .LBB4_1
-; BWOFF-NEXT: .LBB4_2: # %._crit_edge
-; BWOFF-NEXT: retq
+; X86-BWON-LABEL: merge_loads_i16:
+; X86-BWON: # %bb.0:
+; X86-BWON-NEXT: pushl %esi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 8
+; X86-BWON-NEXT: .cfi_offset %esi, -8
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BWON-NEXT: testl %eax, %eax
+; X86-BWON-NEXT: jle .LBB4_3
+; X86-BWON-NEXT: # %bb.1: # %.lr.ph
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BWON-NEXT: .p2align 4, 0x90
+; X86-BWON-NEXT: .LBB4_2: # =>This Inner Loop Header: Depth=1
+; X86-BWON-NEXT: movzwl (%edx), %esi
+; X86-BWON-NEXT: movw %si, (%ecx)
+; X86-BWON-NEXT: addl $8, %ecx
+; X86-BWON-NEXT: decl %eax
+; X86-BWON-NEXT: jne .LBB4_2
+; X86-BWON-NEXT: .LBB4_3: # %._crit_edge
+; X86-BWON-NEXT: popl %esi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 4
+; X86-BWON-NEXT: retl
+;
+; X86-BWOFF-LABEL: merge_loads_i16:
+; X86-BWOFF: # %bb.0:
+; X86-BWOFF-NEXT: pushl %esi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
+; X86-BWOFF-NEXT: .cfi_offset %esi, -8
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BWOFF-NEXT: testl %eax, %eax
+; X86-BWOFF-NEXT: jle .LBB4_3
+; X86-BWOFF-NEXT: # %bb.1: # %.lr.ph
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BWOFF-NEXT: .p2align 4, 0x90
+; X86-BWOFF-NEXT: .LBB4_2: # =>This Inner Loop Header: Depth=1
+; X86-BWOFF-NEXT: movw (%edx), %si
+; X86-BWOFF-NEXT: movw %si, (%ecx)
+; X86-BWOFF-NEXT: addl $8, %ecx
+; X86-BWOFF-NEXT: decl %eax
+; X86-BWOFF-NEXT: jne .LBB4_2
+; X86-BWOFF-NEXT: .LBB4_3: # %._crit_edge
+; X86-BWOFF-NEXT: popl %esi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
+; X86-BWOFF-NEXT: retl
+;
+; X64-BWON-LABEL: merge_loads_i16:
+; X64-BWON: # %bb.0:
+; X64-BWON-NEXT: testl %edi, %edi
+; X64-BWON-NEXT: jle .LBB4_2
+; X64-BWON-NEXT: .p2align 4, 0x90
+; X64-BWON-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
+; X64-BWON-NEXT: movzwl (%rsi), %eax
+; X64-BWON-NEXT: movw %ax, (%rdx)
+; X64-BWON-NEXT: addq $8, %rdx
+; X64-BWON-NEXT: decl %edi
+; X64-BWON-NEXT: jne .LBB4_1
+; X64-BWON-NEXT: .LBB4_2: # %._crit_edge
+; X64-BWON-NEXT: retq
+;
+; X64-BWOFF-LABEL: merge_loads_i16:
+; X64-BWOFF: # %bb.0:
+; X64-BWOFF-NEXT: testl %edi, %edi
+; X64-BWOFF-NEXT: jle .LBB4_2
+; X64-BWOFF-NEXT: .p2align 4, 0x90
+; X64-BWOFF-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
+; X64-BWOFF-NEXT: movw (%rsi), %ax
+; X64-BWOFF-NEXT: movw %ax, (%rdx)
+; X64-BWOFF-NEXT: addq $8, %rdx
+; X64-BWOFF-NEXT: decl %edi
+; X64-BWOFF-NEXT: jne .LBB4_1
+; X64-BWOFF-NEXT: .LBB4_2: # %._crit_edge
+; X64-BWOFF-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -249,39 +400,91 @@ define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struc
; The loads and the stores are interleaved. Can't merge them.
define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
-; BWON-LABEL: no_merge_loads:
-; BWON: # %bb.0:
-; BWON-NEXT: testl %edi, %edi
-; BWON-NEXT: jle .LBB5_2
-; BWON-NEXT: .p2align 4, 0x90
-; BWON-NEXT: .LBB5_1: # %a4
-; BWON-NEXT: # =>This Inner Loop Header: Depth=1
-; BWON-NEXT: movzbl (%rsi), %eax
-; BWON-NEXT: movb %al, (%rdx)
-; BWON-NEXT: movzbl 1(%rsi), %eax
-; BWON-NEXT: movb %al, 1(%rdx)
-; BWON-NEXT: addq $8, %rdx
-; BWON-NEXT: decl %edi
-; BWON-NEXT: jne .LBB5_1
-; BWON-NEXT: .LBB5_2: # %._crit_edge
-; BWON-NEXT: retq
-;
-; BWOFF-LABEL: no_merge_loads:
-; BWOFF: # %bb.0:
-; BWOFF-NEXT: testl %edi, %edi
-; BWOFF-NEXT: jle .LBB5_2
-; BWOFF-NEXT: .p2align 4, 0x90
-; BWOFF-NEXT: .LBB5_1: # %a4
-; BWOFF-NEXT: # =>This Inner Loop Header: Depth=1
-; BWOFF-NEXT: movb (%rsi), %al
-; BWOFF-NEXT: movb %al, (%rdx)
-; BWOFF-NEXT: movb 1(%rsi), %al
-; BWOFF-NEXT: movb %al, 1(%rdx)
-; BWOFF-NEXT: addq $8, %rdx
-; BWOFF-NEXT: decl %edi
-; BWOFF-NEXT: jne .LBB5_1
-; BWOFF-NEXT: .LBB5_2: # %._crit_edge
-; BWOFF-NEXT: retq
+; X86-BWON-LABEL: no_merge_loads:
+; X86-BWON: # %bb.0:
+; X86-BWON-NEXT: pushl %ebx
+; X86-BWON-NEXT: .cfi_def_cfa_offset 8
+; X86-BWON-NEXT: .cfi_offset %ebx, -8
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BWON-NEXT: testl %eax, %eax
+; X86-BWON-NEXT: jle .LBB5_3
+; X86-BWON-NEXT: # %bb.1: # %.lr.ph
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BWON-NEXT: .p2align 4, 0x90
+; X86-BWON-NEXT: .LBB5_2: # %a4
+; X86-BWON-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-BWON-NEXT: movzbl (%edx), %ebx
+; X86-BWON-NEXT: movb %bl, (%ecx)
+; X86-BWON-NEXT: movzbl 1(%edx), %ebx
+; X86-BWON-NEXT: movb %bl, 1(%ecx)
+; X86-BWON-NEXT: addl $8, %ecx
+; X86-BWON-NEXT: decl %eax
+; X86-BWON-NEXT: jne .LBB5_2
+; X86-BWON-NEXT: .LBB5_3: # %._crit_edge
+; X86-BWON-NEXT: popl %ebx
+; X86-BWON-NEXT: .cfi_def_cfa_offset 4
+; X86-BWON-NEXT: retl
+;
+; X86-BWOFF-LABEL: no_merge_loads:
+; X86-BWOFF: # %bb.0:
+; X86-BWOFF-NEXT: pushl %ebx
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
+; X86-BWOFF-NEXT: .cfi_offset %ebx, -8
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BWOFF-NEXT: testl %eax, %eax
+; X86-BWOFF-NEXT: jle .LBB5_3
+; X86-BWOFF-NEXT: # %bb.1: # %.lr.ph
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BWOFF-NEXT: .p2align 4, 0x90
+; X86-BWOFF-NEXT: .LBB5_2: # %a4
+; X86-BWOFF-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-BWOFF-NEXT: movb (%edx), %bl
+; X86-BWOFF-NEXT: movb %bl, (%ecx)
+; X86-BWOFF-NEXT: movb 1(%edx), %bl
+; X86-BWOFF-NEXT: movb %bl, 1(%ecx)
+; X86-BWOFF-NEXT: addl $8, %ecx
+; X86-BWOFF-NEXT: decl %eax
+; X86-BWOFF-NEXT: jne .LBB5_2
+; X86-BWOFF-NEXT: .LBB5_3: # %._crit_edge
+; X86-BWOFF-NEXT: popl %ebx
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
+; X86-BWOFF-NEXT: retl
+;
+; X64-BWON-LABEL: no_merge_loads:
+; X64-BWON: # %bb.0:
+; X64-BWON-NEXT: testl %edi, %edi
+; X64-BWON-NEXT: jle .LBB5_2
+; X64-BWON-NEXT: .p2align 4, 0x90
+; X64-BWON-NEXT: .LBB5_1: # %a4
+; X64-BWON-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-BWON-NEXT: movzbl (%rsi), %eax
+; X64-BWON-NEXT: movb %al, (%rdx)
+; X64-BWON-NEXT: movzbl 1(%rsi), %eax
+; X64-BWON-NEXT: movb %al, 1(%rdx)
+; X64-BWON-NEXT: addq $8, %rdx
+; X64-BWON-NEXT: decl %edi
+; X64-BWON-NEXT: jne .LBB5_1
+; X64-BWON-NEXT: .LBB5_2: # %._crit_edge
+; X64-BWON-NEXT: retq
+;
+; X64-BWOFF-LABEL: no_merge_loads:
+; X64-BWOFF: # %bb.0:
+; X64-BWOFF-NEXT: testl %edi, %edi
+; X64-BWOFF-NEXT: jle .LBB5_2
+; X64-BWOFF-NEXT: .p2align 4, 0x90
+; X64-BWOFF-NEXT: .LBB5_1: # %a4
+; X64-BWOFF-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-BWOFF-NEXT: movb (%rsi), %al
+; X64-BWOFF-NEXT: movb %al, (%rdx)
+; X64-BWOFF-NEXT: movb 1(%rsi), %al
+; X64-BWOFF-NEXT: movb %al, 1(%rdx)
+; X64-BWOFF-NEXT: addq $8, %rdx
+; X64-BWOFF-NEXT: decl %edi
+; X64-BWOFF-NEXT: jne .LBB5_1
+; X64-BWOFF-NEXT: .LBB5_2: # %._crit_edge
+; X64-BWOFF-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -309,19 +512,49 @@ a4: ; preds = %4, %.lr.ph
}
define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
-; CHECK-LABEL: merge_loads_integer:
-; CHECK: # %bb.0:
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jle .LBB6_2
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq (%rsi), %rax
-; CHECK-NEXT: movq %rax, (%rdx)
-; CHECK-NEXT: addq $32, %rdx
-; CHECK-NEXT: decl %edi
-; CHECK-NEXT: jne .LBB6_1
-; CHECK-NEXT: .LBB6_2: # %._crit_edge
-; CHECK-NEXT: retq
+; X86-LABEL: merge_loads_integer:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 12
+; X86-NEXT: .cfi_offset %esi, -12
+; X86-NEXT: .cfi_offset %edi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: jle .LBB6_3
+; X86-NEXT: # %bb.1: # %.lr.ph
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB6_2: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl (%edx), %esi
+; X86-NEXT: movl 4(%edx), %edi
+; X86-NEXT: movl %esi, (%ecx)
+; X86-NEXT: movl %edi, 4(%ecx)
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: decl %eax
+; X86-NEXT: jne .LBB6_2
+; X86-NEXT: .LBB6_3: # %._crit_edge
+; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_loads_integer:
+; X64: # %bb.0:
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: jle .LBB6_2
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: movq (%rsi), %rax
+; X64-NEXT: movq %rax, (%rdx)
+; X64-NEXT: addq $32, %rdx
+; X64-NEXT: decl %edi
+; X64-NEXT: jne .LBB6_1
+; X64-NEXT: .LBB6_2: # %._crit_edge
+; X64-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -349,20 +582,39 @@ define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %s
}
define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
-; CHECK-LABEL: merge_loads_vector:
-; CHECK: # %bb.0:
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jle .LBB7_2
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB7_1: # %block4
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups (%rsi), %xmm0
-; CHECK-NEXT: vmovups %xmm0, (%rdx)
-; CHECK-NEXT: addq $32, %rdx
-; CHECK-NEXT: decl %edi
-; CHECK-NEXT: jne .LBB7_1
-; CHECK-NEXT: .LBB7_2: # %._crit_edge
-; CHECK-NEXT: retq
+; X86-LABEL: merge_loads_vector:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: jle .LBB7_3
+; X86-NEXT: # %bb.1: # %.lr.ph
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB7_2: # %block4
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: vmovups (%edx), %xmm0
+; X86-NEXT: vmovups %xmm0, (%ecx)
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: decl %eax
+; X86-NEXT: jne .LBB7_2
+; X86-NEXT: .LBB7_3: # %._crit_edge
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_loads_vector:
+; X64: # %bb.0:
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: jle .LBB7_2
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB7_1: # %block4
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: vmovups (%rsi), %xmm0
+; X64-NEXT: vmovups %xmm0, (%rdx)
+; X64-NEXT: addq $32, %rdx
+; X64-NEXT: decl %edi
+; X64-NEXT: jne .LBB7_1
+; X64-NEXT: .LBB7_2: # %._crit_edge
+; X64-NEXT: retq
%a1 = icmp sgt i32 %count, 0
br i1 %a1, label %.lr.ph, label %._crit_edge
@@ -399,20 +651,39 @@ block4: ; preds = %4, %.lr.ph
; On x86, even unaligned copies can be merged to vector ops.
define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
-; CHECK-LABEL: merge_loads_no_align:
-; CHECK: # %bb.0:
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jle .LBB8_2
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB8_1: # %block4
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovups (%rsi), %xmm0
-; CHECK-NEXT: vmovups %xmm0, (%rdx)
-; CHECK-NEXT: addq $32, %rdx
-; CHECK-NEXT: decl %edi
-; CHECK-NEXT: jne .LBB8_1
-; CHECK-NEXT: .LBB8_2: # %._crit_edge
-; CHECK-NEXT: retq
+; X86-LABEL: merge_loads_no_align:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: jle .LBB8_3
+; X86-NEXT: # %bb.1: # %.lr.ph
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB8_2: # %block4
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: vmovups (%edx), %xmm0
+; X86-NEXT: vmovups %xmm0, (%ecx)
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: decl %eax
+; X86-NEXT: jne .LBB8_2
+; X86-NEXT: .LBB8_3: # %._crit_edge
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_loads_no_align:
+; X64: # %bb.0:
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: jle .LBB8_2
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB8_1: # %block4
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: vmovups (%rsi), %xmm0
+; X64-NEXT: vmovups %xmm0, (%rdx)
+; X64-NEXT: addq $32, %rdx
+; X64-NEXT: decl %edi
+; X64-NEXT: jne .LBB8_1
+; X64-NEXT: .LBB8_2: # %._crit_edge
+; X64-NEXT: retq
%a1 = icmp sgt i32 %count, 0
br i1 %a1, label %.lr.ph, label %._crit_edge
@@ -450,35 +721,101 @@ block4: ; preds = %4, %.lr.ph
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy.
define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
-; BWON-LABEL: MergeLoadStoreBaseIndexOffset:
-; BWON: # %bb.0:
-; BWON-NEXT: movl %ecx, %eax
-; BWON-NEXT: xorl %ecx, %ecx
-; BWON-NEXT: .p2align 4, 0x90
-; BWON-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
-; BWON-NEXT: movq (%rdi,%rcx,8), %r8
-; BWON-NEXT: movzwl (%rdx,%r8), %r8d
-; BWON-NEXT: movw %r8w, (%rsi,%rcx,2)
-; BWON-NEXT: incq %rcx
-; BWON-NEXT: cmpl %ecx, %eax
-; BWON-NEXT: jne .LBB9_1
-; BWON-NEXT: # %bb.2:
-; BWON-NEXT: retq
-;
-; BWOFF-LABEL: MergeLoadStoreBaseIndexOffset:
-; BWOFF: # %bb.0:
-; BWOFF-NEXT: movl %ecx, %eax
-; BWOFF-NEXT: xorl %ecx, %ecx
-; BWOFF-NEXT: .p2align 4, 0x90
-; BWOFF-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
-; BWOFF-NEXT: movq (%rdi,%rcx,8), %r8
-; BWOFF-NEXT: movw (%rdx,%r8), %r8w
-; BWOFF-NEXT: movw %r8w, (%rsi,%rcx,2)
-; BWOFF-NEXT: incq %rcx
-; BWOFF-NEXT: cmpl %ecx, %eax
-; BWOFF-NEXT: jne .LBB9_1
-; BWOFF-NEXT: # %bb.2:
-; BWOFF-NEXT: retq
+; X86-BWON-LABEL: MergeLoadStoreBaseIndexOffset:
+; X86-BWON: # %bb.0:
+; X86-BWON-NEXT: pushl %ebx
+; X86-BWON-NEXT: .cfi_def_cfa_offset 8
+; X86-BWON-NEXT: pushl %edi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 12
+; X86-BWON-NEXT: pushl %esi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 16
+; X86-BWON-NEXT: .cfi_offset %esi, -16
+; X86-BWON-NEXT: .cfi_offset %edi, -12
+; X86-BWON-NEXT: .cfi_offset %ebx, -8
+; X86-BWON-NEXT: xorl %eax, %eax
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BWON-NEXT: .p2align 4, 0x90
+; X86-BWON-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
+; X86-BWON-NEXT: movl (%edi,%eax,8), %ebx
+; X86-BWON-NEXT: movzwl (%edx,%ebx), %ebx
+; X86-BWON-NEXT: movw %bx, (%esi,%eax,2)
+; X86-BWON-NEXT: incl %eax
+; X86-BWON-NEXT: cmpl %eax, %ecx
+; X86-BWON-NEXT: jne .LBB9_1
+; X86-BWON-NEXT: # %bb.2:
+; X86-BWON-NEXT: popl %esi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 12
+; X86-BWON-NEXT: popl %edi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 8
+; X86-BWON-NEXT: popl %ebx
+; X86-BWON-NEXT: .cfi_def_cfa_offset 4
+; X86-BWON-NEXT: retl
+;
+; X86-BWOFF-LABEL: MergeLoadStoreBaseIndexOffset:
+; X86-BWOFF: # %bb.0:
+; X86-BWOFF-NEXT: pushl %ebx
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
+; X86-BWOFF-NEXT: pushl %edi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
+; X86-BWOFF-NEXT: pushl %esi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
+; X86-BWOFF-NEXT: .cfi_offset %esi, -16
+; X86-BWOFF-NEXT: .cfi_offset %edi, -12
+; X86-BWOFF-NEXT: .cfi_offset %ebx, -8
+; X86-BWOFF-NEXT: xorl %eax, %eax
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BWOFF-NEXT: .p2align 4, 0x90
+; X86-BWOFF-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
+; X86-BWOFF-NEXT: movl (%edi,%eax,8), %ebx
+; X86-BWOFF-NEXT: movw (%edx,%ebx), %bx
+; X86-BWOFF-NEXT: movw %bx, (%esi,%eax,2)
+; X86-BWOFF-NEXT: incl %eax
+; X86-BWOFF-NEXT: cmpl %eax, %ecx
+; X86-BWOFF-NEXT: jne .LBB9_1
+; X86-BWOFF-NEXT: # %bb.2:
+; X86-BWOFF-NEXT: popl %esi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
+; X86-BWOFF-NEXT: popl %edi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
+; X86-BWOFF-NEXT: popl %ebx
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
+; X86-BWOFF-NEXT: retl
+;
+; X64-BWON-LABEL: MergeLoadStoreBaseIndexOffset:
+; X64-BWON: # %bb.0:
+; X64-BWON-NEXT: movl %ecx, %eax
+; X64-BWON-NEXT: xorl %ecx, %ecx
+; X64-BWON-NEXT: .p2align 4, 0x90
+; X64-BWON-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
+; X64-BWON-NEXT: movq (%rdi,%rcx,8), %r8
+; X64-BWON-NEXT: movzwl (%rdx,%r8), %r8d
+; X64-BWON-NEXT: movw %r8w, (%rsi,%rcx,2)
+; X64-BWON-NEXT: incq %rcx
+; X64-BWON-NEXT: cmpl %ecx, %eax
+; X64-BWON-NEXT: jne .LBB9_1
+; X64-BWON-NEXT: # %bb.2:
+; X64-BWON-NEXT: retq
+;
+; X64-BWOFF-LABEL: MergeLoadStoreBaseIndexOffset:
+; X64-BWOFF: # %bb.0:
+; X64-BWOFF-NEXT: movl %ecx, %eax
+; X64-BWOFF-NEXT: xorl %ecx, %ecx
+; X64-BWOFF-NEXT: .p2align 4, 0x90
+; X64-BWOFF-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
+; X64-BWOFF-NEXT: movq (%rdi,%rcx,8), %r8
+; X64-BWOFF-NEXT: movw (%rdx,%r8), %r8w
+; X64-BWOFF-NEXT: movw %r8w, (%rsi,%rcx,2)
+; X64-BWOFF-NEXT: incq %rcx
+; X64-BWOFF-NEXT: cmpl %ecx, %eax
+; X64-BWOFF-NEXT: jne .LBB9_1
+; X64-BWOFF-NEXT: # %bb.2:
+; X64-BWOFF-NEXT: retq
br label %1
; <label>:1
@@ -507,35 +844,127 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy for complicated address calculation.
define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) {
-; BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
-; BWON: # %bb.0:
-; BWON-NEXT: xorl %eax, %eax
-; BWON-NEXT: .p2align 4, 0x90
-; BWON-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
-; BWON-NEXT: movsbq (%rsi), %r8
-; BWON-NEXT: movzwl (%rdx,%r8), %r8d
-; BWON-NEXT: movw %r8w, (%rdi,%rax)
-; BWON-NEXT: incq %rsi
-; BWON-NEXT: addq $2, %rax
-; BWON-NEXT: cmpq %rcx, %rax
-; BWON-NEXT: jl .LBB10_1
-; BWON-NEXT: # %bb.2:
-; BWON-NEXT: retq
-;
-; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
-; BWOFF: # %bb.0:
-; BWOFF-NEXT: xorl %eax, %eax
-; BWOFF-NEXT: .p2align 4, 0x90
-; BWOFF-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
-; BWOFF-NEXT: movsbq (%rsi), %r8
-; BWOFF-NEXT: movw (%rdx,%r8), %r8w
-; BWOFF-NEXT: movw %r8w, (%rdi,%rax)
-; BWOFF-NEXT: incq %rsi
-; BWOFF-NEXT: addq $2, %rax
-; BWOFF-NEXT: cmpq %rcx, %rax
-; BWOFF-NEXT: jl .LBB10_1
-; BWOFF-NEXT: # %bb.2:
-; BWOFF-NEXT: retq
+; X86-BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
+; X86-BWON: # %bb.0:
+; X86-BWON-NEXT: pushl %ebp
+; X86-BWON-NEXT: .cfi_def_cfa_offset 8
+; X86-BWON-NEXT: pushl %ebx
+; X86-BWON-NEXT: .cfi_def_cfa_offset 12
+; X86-BWON-NEXT: pushl %edi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 16
+; X86-BWON-NEXT: pushl %esi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 20
+; X86-BWON-NEXT: .cfi_offset %esi, -20
+; X86-BWON-NEXT: .cfi_offset %edi, -16
+; X86-BWON-NEXT: .cfi_offset %ebx, -12
+; X86-BWON-NEXT: .cfi_offset %ebp, -8
+; X86-BWON-NEXT: xorl %eax, %eax
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BWON-NEXT: xorl %ebp, %ebp
+; X86-BWON-NEXT: .p2align 4, 0x90
+; X86-BWON-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
+; X86-BWON-NEXT: movsbl (%edi), %ecx
+; X86-BWON-NEXT: movzbl (%esi,%ecx), %edx
+; X86-BWON-NEXT: movzbl 1(%esi,%ecx), %ecx
+; X86-BWON-NEXT: movb %dl, (%ebx,%eax)
+; X86-BWON-NEXT: movl %eax, %edx
+; X86-BWON-NEXT: orl $1, %edx
+; X86-BWON-NEXT: movb %cl, (%ebx,%edx)
+; X86-BWON-NEXT: incl %edi
+; X86-BWON-NEXT: addl $2, %eax
+; X86-BWON-NEXT: adcl $0, %ebp
+; X86-BWON-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-BWON-NEXT: movl %ebp, %ecx
+; X86-BWON-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
+; X86-BWON-NEXT: jl .LBB10_1
+; X86-BWON-NEXT: # %bb.2:
+; X86-BWON-NEXT: popl %esi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 16
+; X86-BWON-NEXT: popl %edi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 12
+; X86-BWON-NEXT: popl %ebx
+; X86-BWON-NEXT: .cfi_def_cfa_offset 8
+; X86-BWON-NEXT: popl %ebp
+; X86-BWON-NEXT: .cfi_def_cfa_offset 4
+; X86-BWON-NEXT: retl
+;
+; X86-BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
+; X86-BWOFF: # %bb.0:
+; X86-BWOFF-NEXT: pushl %ebp
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
+; X86-BWOFF-NEXT: pushl %ebx
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
+; X86-BWOFF-NEXT: pushl %edi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
+; X86-BWOFF-NEXT: pushl %esi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 20
+; X86-BWOFF-NEXT: .cfi_offset %esi, -20
+; X86-BWOFF-NEXT: .cfi_offset %edi, -16
+; X86-BWOFF-NEXT: .cfi_offset %ebx, -12
+; X86-BWOFF-NEXT: .cfi_offset %ebp, -8
+; X86-BWOFF-NEXT: xorl %eax, %eax
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-BWOFF-NEXT: xorl %ebp, %ebp
+; X86-BWOFF-NEXT: .p2align 4, 0x90
+; X86-BWOFF-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
+; X86-BWOFF-NEXT: movsbl (%edi), %ecx
+; X86-BWOFF-NEXT: movb (%esi,%ecx), %dl
+; X86-BWOFF-NEXT: movb 1(%esi,%ecx), %cl
+; X86-BWOFF-NEXT: movb %dl, (%ebx,%eax)
+; X86-BWOFF-NEXT: movl %eax, %edx
+; X86-BWOFF-NEXT: orl $1, %edx
+; X86-BWOFF-NEXT: movb %cl, (%ebx,%edx)
+; X86-BWOFF-NEXT: incl %edi
+; X86-BWOFF-NEXT: addl $2, %eax
+; X86-BWOFF-NEXT: adcl $0, %ebp
+; X86-BWOFF-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-BWOFF-NEXT: movl %ebp, %ecx
+; X86-BWOFF-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
+; X86-BWOFF-NEXT: jl .LBB10_1
+; X86-BWOFF-NEXT: # %bb.2:
+; X86-BWOFF-NEXT: popl %esi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
+; X86-BWOFF-NEXT: popl %edi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
+; X86-BWOFF-NEXT: popl %ebx
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
+; X86-BWOFF-NEXT: popl %ebp
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
+; X86-BWOFF-NEXT: retl
+;
+; X64-BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
+; X64-BWON: # %bb.0:
+; X64-BWON-NEXT: xorl %eax, %eax
+; X64-BWON-NEXT: .p2align 4, 0x90
+; X64-BWON-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
+; X64-BWON-NEXT: movsbq (%rsi), %r8
+; X64-BWON-NEXT: movzwl (%rdx,%r8), %r8d
+; X64-BWON-NEXT: movw %r8w, (%rdi,%rax)
+; X64-BWON-NEXT: incq %rsi
+; X64-BWON-NEXT: addq $2, %rax
+; X64-BWON-NEXT: cmpq %rcx, %rax
+; X64-BWON-NEXT: jl .LBB10_1
+; X64-BWON-NEXT: # %bb.2:
+; X64-BWON-NEXT: retq
+;
+; X64-BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
+; X64-BWOFF: # %bb.0:
+; X64-BWOFF-NEXT: xorl %eax, %eax
+; X64-BWOFF-NEXT: .p2align 4, 0x90
+; X64-BWOFF-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
+; X64-BWOFF-NEXT: movsbq (%rsi), %r8
+; X64-BWOFF-NEXT: movw (%rdx,%r8), %r8w
+; X64-BWOFF-NEXT: movw %r8w, (%rdi,%rax)
+; X64-BWOFF-NEXT: incq %rsi
+; X64-BWOFF-NEXT: addq $2, %rax
+; X64-BWOFF-NEXT: cmpq %rcx, %rax
+; X64-BWOFF-NEXT: jl .LBB10_1
+; X64-BWOFF-NEXT: # %bb.2:
+; X64-BWOFF-NEXT: retq
br label %1
; <label>:1
@@ -566,35 +995,101 @@ define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i6
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
-; BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext:
-; BWON: # %bb.0:
-; BWON-NEXT: movl %ecx, %eax
-; BWON-NEXT: xorl %ecx, %ecx
-; BWON-NEXT: .p2align 4, 0x90
-; BWON-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
-; BWON-NEXT: movsbq (%rdi,%rcx), %r8
-; BWON-NEXT: movzwl (%rdx,%r8), %r8d
-; BWON-NEXT: movw %r8w, (%rsi,%rcx,2)
-; BWON-NEXT: incq %rcx
-; BWON-NEXT: cmpl %ecx, %eax
-; BWON-NEXT: jne .LBB11_1
-; BWON-NEXT: # %bb.2:
-; BWON-NEXT: retq
-;
-; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext:
-; BWOFF: # %bb.0:
-; BWOFF-NEXT: movl %ecx, %eax
-; BWOFF-NEXT: xorl %ecx, %ecx
-; BWOFF-NEXT: .p2align 4, 0x90
-; BWOFF-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
-; BWOFF-NEXT: movsbq (%rdi,%rcx), %r8
-; BWOFF-NEXT: movw (%rdx,%r8), %r8w
-; BWOFF-NEXT: movw %r8w, (%rsi,%rcx,2)
-; BWOFF-NEXT: incq %rcx
-; BWOFF-NEXT: cmpl %ecx, %eax
-; BWOFF-NEXT: jne .LBB11_1
-; BWOFF-NEXT: # %bb.2:
-; BWOFF-NEXT: retq
+; X86-BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext:
+; X86-BWON: # %bb.0:
+; X86-BWON-NEXT: pushl %ebx
+; X86-BWON-NEXT: .cfi_def_cfa_offset 8
+; X86-BWON-NEXT: pushl %edi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 12
+; X86-BWON-NEXT: pushl %esi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 16
+; X86-BWON-NEXT: .cfi_offset %esi, -16
+; X86-BWON-NEXT: .cfi_offset %edi, -12
+; X86-BWON-NEXT: .cfi_offset %ebx, -8
+; X86-BWON-NEXT: xorl %eax, %eax
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BWON-NEXT: .p2align 4, 0x90
+; X86-BWON-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; X86-BWON-NEXT: movsbl (%edi,%eax), %ebx
+; X86-BWON-NEXT: movzwl (%edx,%ebx), %ebx
+; X86-BWON-NEXT: movw %bx, (%esi,%eax,2)
+; X86-BWON-NEXT: incl %eax
+; X86-BWON-NEXT: cmpl %eax, %ecx
+; X86-BWON-NEXT: jne .LBB11_1
+; X86-BWON-NEXT: # %bb.2:
+; X86-BWON-NEXT: popl %esi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 12
+; X86-BWON-NEXT: popl %edi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 8
+; X86-BWON-NEXT: popl %ebx
+; X86-BWON-NEXT: .cfi_def_cfa_offset 4
+; X86-BWON-NEXT: retl
+;
+; X86-BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext:
+; X86-BWOFF: # %bb.0:
+; X86-BWOFF-NEXT: pushl %ebx
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
+; X86-BWOFF-NEXT: pushl %edi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
+; X86-BWOFF-NEXT: pushl %esi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
+; X86-BWOFF-NEXT: .cfi_offset %esi, -16
+; X86-BWOFF-NEXT: .cfi_offset %edi, -12
+; X86-BWOFF-NEXT: .cfi_offset %ebx, -8
+; X86-BWOFF-NEXT: xorl %eax, %eax
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BWOFF-NEXT: .p2align 4, 0x90
+; X86-BWOFF-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; X86-BWOFF-NEXT: movsbl (%edi,%eax), %ebx
+; X86-BWOFF-NEXT: movw (%edx,%ebx), %bx
+; X86-BWOFF-NEXT: movw %bx, (%esi,%eax,2)
+; X86-BWOFF-NEXT: incl %eax
+; X86-BWOFF-NEXT: cmpl %eax, %ecx
+; X86-BWOFF-NEXT: jne .LBB11_1
+; X86-BWOFF-NEXT: # %bb.2:
+; X86-BWOFF-NEXT: popl %esi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
+; X86-BWOFF-NEXT: popl %edi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
+; X86-BWOFF-NEXT: popl %ebx
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
+; X86-BWOFF-NEXT: retl
+;
+; X64-BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext:
+; X64-BWON: # %bb.0:
+; X64-BWON-NEXT: movl %ecx, %eax
+; X64-BWON-NEXT: xorl %ecx, %ecx
+; X64-BWON-NEXT: .p2align 4, 0x90
+; X64-BWON-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; X64-BWON-NEXT: movsbq (%rdi,%rcx), %r8
+; X64-BWON-NEXT: movzwl (%rdx,%r8), %r8d
+; X64-BWON-NEXT: movw %r8w, (%rsi,%rcx,2)
+; X64-BWON-NEXT: incq %rcx
+; X64-BWON-NEXT: cmpl %ecx, %eax
+; X64-BWON-NEXT: jne .LBB11_1
+; X64-BWON-NEXT: # %bb.2:
+; X64-BWON-NEXT: retq
+;
+; X64-BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext:
+; X64-BWOFF: # %bb.0:
+; X64-BWOFF-NEXT: movl %ecx, %eax
+; X64-BWOFF-NEXT: xorl %ecx, %ecx
+; X64-BWOFF-NEXT: .p2align 4, 0x90
+; X64-BWOFF-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; X64-BWOFF-NEXT: movsbq (%rdi,%rcx), %r8
+; X64-BWOFF-NEXT: movw (%rdx,%r8), %r8w
+; X64-BWOFF-NEXT: movw %r8w, (%rsi,%rcx,2)
+; X64-BWOFF-NEXT: incq %rcx
+; X64-BWOFF-NEXT: cmpl %ecx, %eax
+; X64-BWOFF-NEXT: jne .LBB11_1
+; X64-BWOFF-NEXT: # %bb.2:
+; X64-BWOFF-NEXT: retq
br label %1
; <label>:1
@@ -624,43 +1119,127 @@ define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
; However, we can only merge ignore sign extensions when they are on all memory
; computations;
define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
-; BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex:
-; BWON: # %bb.0:
-; BWON-NEXT: movl %ecx, %eax
-; BWON-NEXT: xorl %ecx, %ecx
-; BWON-NEXT: .p2align 4, 0x90
-; BWON-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
-; BWON-NEXT: movsbq (%rdi,%rcx), %r8
-; BWON-NEXT: movzbl (%rdx,%r8), %r9d
-; BWON-NEXT: incl %r8d
-; BWON-NEXT: movsbq %r8b, %r8
-; BWON-NEXT: movzbl (%rdx,%r8), %r8d
-; BWON-NEXT: movb %r9b, (%rsi,%rcx,2)
-; BWON-NEXT: movb %r8b, 1(%rsi,%rcx,2)
-; BWON-NEXT: incq %rcx
-; BWON-NEXT: cmpl %ecx, %eax
-; BWON-NEXT: jne .LBB12_1
-; BWON-NEXT: # %bb.2:
-; BWON-NEXT: retq
-;
-; BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex:
-; BWOFF: # %bb.0:
-; BWOFF-NEXT: movl %ecx, %eax
-; BWOFF-NEXT: xorl %ecx, %ecx
-; BWOFF-NEXT: .p2align 4, 0x90
-; BWOFF-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
-; BWOFF-NEXT: movsbq (%rdi,%rcx), %r8
-; BWOFF-NEXT: movb (%rdx,%r8), %r9b
-; BWOFF-NEXT: incl %r8d
-; BWOFF-NEXT: movsbq %r8b, %r8
-; BWOFF-NEXT: movb (%rdx,%r8), %r8b
-; BWOFF-NEXT: movb %r9b, (%rsi,%rcx,2)
-; BWOFF-NEXT: movb %r8b, 1(%rsi,%rcx,2)
-; BWOFF-NEXT: incq %rcx
-; BWOFF-NEXT: cmpl %ecx, %eax
-; BWOFF-NEXT: jne .LBB12_1
-; BWOFF-NEXT: # %bb.2:
-; BWOFF-NEXT: retq
+; X86-BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex:
+; X86-BWON: # %bb.0:
+; X86-BWON-NEXT: pushl %ebp
+; X86-BWON-NEXT: .cfi_def_cfa_offset 8
+; X86-BWON-NEXT: pushl %ebx
+; X86-BWON-NEXT: .cfi_def_cfa_offset 12
+; X86-BWON-NEXT: pushl %edi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 16
+; X86-BWON-NEXT: pushl %esi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 20
+; X86-BWON-NEXT: .cfi_offset %esi, -20
+; X86-BWON-NEXT: .cfi_offset %edi, -16
+; X86-BWON-NEXT: .cfi_offset %ebx, -12
+; X86-BWON-NEXT: .cfi_offset %ebp, -8
+; X86-BWON-NEXT: xorl %eax, %eax
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BWON-NEXT: .p2align 4, 0x90
+; X86-BWON-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
+; X86-BWON-NEXT: movsbl (%edi,%eax), %ebx
+; X86-BWON-NEXT: movzbl (%edx,%ebx), %ecx
+; X86-BWON-NEXT: incb %bl
+; X86-BWON-NEXT: movsbl %bl, %ebx
+; X86-BWON-NEXT: movb (%edx,%ebx), %ch
+; X86-BWON-NEXT: movb %cl, (%esi,%eax,2)
+; X86-BWON-NEXT: movb %ch, 1(%esi,%eax,2)
+; X86-BWON-NEXT: incl %eax
+; X86-BWON-NEXT: cmpl %eax, %ebp
+; X86-BWON-NEXT: jne .LBB12_1
+; X86-BWON-NEXT: # %bb.2:
+; X86-BWON-NEXT: popl %esi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 16
+; X86-BWON-NEXT: popl %edi
+; X86-BWON-NEXT: .cfi_def_cfa_offset 12
+; X86-BWON-NEXT: popl %ebx
+; X86-BWON-NEXT: .cfi_def_cfa_offset 8
+; X86-BWON-NEXT: popl %ebp
+; X86-BWON-NEXT: .cfi_def_cfa_offset 4
+; X86-BWON-NEXT: retl
+;
+; X86-BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex:
+; X86-BWOFF: # %bb.0:
+; X86-BWOFF-NEXT: pushl %ebp
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
+; X86-BWOFF-NEXT: pushl %ebx
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
+; X86-BWOFF-NEXT: pushl %edi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
+; X86-BWOFF-NEXT: pushl %esi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 20
+; X86-BWOFF-NEXT: .cfi_offset %esi, -20
+; X86-BWOFF-NEXT: .cfi_offset %edi, -16
+; X86-BWOFF-NEXT: .cfi_offset %ebx, -12
+; X86-BWOFF-NEXT: .cfi_offset %ebp, -8
+; X86-BWOFF-NEXT: xorl %eax, %eax
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BWOFF-NEXT: .p2align 4, 0x90
+; X86-BWOFF-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
+; X86-BWOFF-NEXT: movsbl (%edi,%eax), %ebx
+; X86-BWOFF-NEXT: movb (%edx,%ebx), %cl
+; X86-BWOFF-NEXT: incb %bl
+; X86-BWOFF-NEXT: movsbl %bl, %ebx
+; X86-BWOFF-NEXT: movb (%edx,%ebx), %ch
+; X86-BWOFF-NEXT: movb %cl, (%esi,%eax,2)
+; X86-BWOFF-NEXT: movb %ch, 1(%esi,%eax,2)
+; X86-BWOFF-NEXT: incl %eax
+; X86-BWOFF-NEXT: cmpl %eax, %ebp
+; X86-BWOFF-NEXT: jne .LBB12_1
+; X86-BWOFF-NEXT: # %bb.2:
+; X86-BWOFF-NEXT: popl %esi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
+; X86-BWOFF-NEXT: popl %edi
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
+; X86-BWOFF-NEXT: popl %ebx
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
+; X86-BWOFF-NEXT: popl %ebp
+; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
+; X86-BWOFF-NEXT: retl
+;
+; X64-BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex:
+; X64-BWON: # %bb.0:
+; X64-BWON-NEXT: movl %ecx, %eax
+; X64-BWON-NEXT: xorl %ecx, %ecx
+; X64-BWON-NEXT: .p2align 4, 0x90
+; X64-BWON-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
+; X64-BWON-NEXT: movsbq (%rdi,%rcx), %r8
+; X64-BWON-NEXT: movzbl (%rdx,%r8), %r9d
+; X64-BWON-NEXT: incl %r8d
+; X64-BWON-NEXT: movsbq %r8b, %r8
+; X64-BWON-NEXT: movzbl (%rdx,%r8), %r8d
+; X64-BWON-NEXT: movb %r9b, (%rsi,%rcx,2)
+; X64-BWON-NEXT: movb %r8b, 1(%rsi,%rcx,2)
+; X64-BWON-NEXT: incq %rcx
+; X64-BWON-NEXT: cmpl %ecx, %eax
+; X64-BWON-NEXT: jne .LBB12_1
+; X64-BWON-NEXT: # %bb.2:
+; X64-BWON-NEXT: retq
+;
+; X64-BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex:
+; X64-BWOFF: # %bb.0:
+; X64-BWOFF-NEXT: movl %ecx, %eax
+; X64-BWOFF-NEXT: xorl %ecx, %ecx
+; X64-BWOFF-NEXT: .p2align 4, 0x90
+; X64-BWOFF-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
+; X64-BWOFF-NEXT: movsbq (%rdi,%rcx), %r8
+; X64-BWOFF-NEXT: movb (%rdx,%r8), %r9b
+; X64-BWOFF-NEXT: incl %r8d
+; X64-BWOFF-NEXT: movsbq %r8b, %r8
+; X64-BWOFF-NEXT: movb (%rdx,%r8), %r8b
+; X64-BWOFF-NEXT: movb %r9b, (%rsi,%rcx,2)
+; X64-BWOFF-NEXT: movb %r8b, 1(%rsi,%rcx,2)
+; X64-BWOFF-NEXT: incq %rcx
+; X64-BWOFF-NEXT: cmpl %ecx, %eax
+; X64-BWOFF-NEXT: jne .LBB12_1
+; X64-BWOFF-NEXT: # %bb.2:
+; X64-BWOFF-NEXT: retq
br label %1
; <label>:1
@@ -690,11 +1269,18 @@ define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
-; CHECK-LABEL: merge_vec_element_store:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovups %ymm0, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X86-LABEL: merge_vec_element_store:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovups %ymm0, (%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_vec_element_store:
+; X64: # %bb.0:
+; X64-NEXT: vmovups %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%vecext0 = extractelement <8 x float> %v, i32 0
%vecext1 = extractelement <8 x float> %v, i32 1
%vecext2 = extractelement <8 x float> %v, i32 2
@@ -725,12 +1311,20 @@ define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
; PR21711 - Merge vector stores into wider vector stores.
; These should be merged into 32-byte stores.
define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
-; CHECK-LABEL: merge_vec_extract_stores:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovups %ymm0, 48(%rdi)
-; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X86-LABEL: merge_vec_extract_stores:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovups %ymm0, 48(%eax)
+; X86-NEXT: vmovups %ymm1, 80(%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_vec_extract_stores:
+; X64: # %bb.0:
+; X64-NEXT: vmovups %ymm0, 48(%rdi)
+; X64-NEXT: vmovups %ymm1, 80(%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
%idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
%idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
@@ -749,12 +1343,21 @@ define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x flo
; Merging vector stores when sourced from vector loads.
define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
-; CHECK-LABEL: merge_vec_stores_from_loads:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovups (%rdi), %ymm0
-; CHECK-NEXT: vmovups %ymm0, (%rsi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X86-LABEL: merge_vec_stores_from_loads:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovups (%ecx), %ymm0
+; X86-NEXT: vmovups %ymm0, (%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_vec_stores_from_loads:
+; X64: # %bb.0:
+; X64-NEXT: vmovups (%rdi), %ymm0
+; X64-NEXT: vmovups %ymm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%load_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0
%load_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1
%v0 = load <4 x float>, <4 x float>* %load_idx0
@@ -768,12 +1371,20 @@ define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
}
define void @merge_vec_stores_of_zero(<4 x i32>* %ptr) {
-; CHECK-LABEL: merge_vec_stores_of_zero:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %ymm0, 48(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X86-LABEL: merge_vec_stores_of_zero:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovups %ymm0, 48(%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_vec_stores_of_zero:
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovups %ymm0, 48(%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
%idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
store <4 x i32> zeroinitializer, <4 x i32>* %idx0, align 16
@@ -782,12 +1393,20 @@ define void @merge_vec_stores_of_zero(<4 x i32>* %ptr) {
}
define void @merge_vec_stores_of_constant_splat(<4 x i32>* %ptr) {
-; CHECK-LABEL: merge_vec_stores_of_constant_splat:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
-; CHECK-NEXT: vmovaps %xmm0, 48(%rdi)
-; CHECK-NEXT: vmovaps %xmm0, 64(%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: merge_vec_stores_of_constant_splat:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
+; X86-NEXT: vmovaps %xmm0, 48(%eax)
+; X86-NEXT: vmovaps %xmm0, 64(%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_vec_stores_of_constant_splat:
+; X64: # %bb.0:
+; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
+; X64-NEXT: vmovaps %xmm0, 48(%rdi)
+; X64-NEXT: vmovaps %xmm0, 64(%rdi)
+; X64-NEXT: retq
%idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
%idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %idx0, align 16
@@ -796,13 +1415,22 @@ define void @merge_vec_stores_of_constant_splat(<4 x i32>* %ptr) {
}
define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
-; CHECK-LABEL: merge_vec_stores_of_constants:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [25,51,45,0]
-; CHECK-NEXT: vmovaps %xmm0, 48(%rdi)
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,265,26,0]
-; CHECK-NEXT: vmovaps %xmm0, 64(%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: merge_vec_stores_of_constants:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovaps {{.*#+}} xmm0 = [25,51,45,0]
+; X86-NEXT: vmovaps %xmm0, 48(%eax)
+; X86-NEXT: vmovaps {{.*#+}} xmm0 = [0,265,26,0]
+; X86-NEXT: vmovaps %xmm0, 64(%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_vec_stores_of_constants:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [25,51,45,0]
+; X64-NEXT: vmovaps %xmm0, 48(%rdi)
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,265,26,0]
+; X64-NEXT: vmovaps %xmm0, 64(%rdi)
+; X64-NEXT: retq
%idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
%idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
store <4 x i32> <i32 25, i32 51, i32 45, i32 0>, <4 x i32>* %idx0, align 16
@@ -811,12 +1439,20 @@ define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
}
define void @merge_vec_stores_of_constants_with_undefs(<4 x i32>* %ptr) {
-; CHECK-LABEL: merge_vec_stores_of_constants_with_undefs:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %ymm0, 48(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X86-LABEL: merge_vec_stores_of_constants_with_undefs:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovups %ymm0, 48(%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_vec_stores_of_constants_with_undefs:
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovups %ymm0, 48(%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
%idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
store <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, <4 x i32>* %idx0, align 16
@@ -827,11 +1463,22 @@ define void @merge_vec_stores_of_constants_with_undefs(<4 x i32>* %ptr) {
; This is a minimized test based on real code that was failing.
; This should now be merged.
define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
-; CHECK-LABEL: merge_vec_element_and_scalar_load:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovups (%rdi), %xmm0
-; CHECK-NEXT: vmovups %xmm0, 32(%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: merge_vec_element_and_scalar_load:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl (%eax), %ecx
+; X86-NEXT: movl 4(%eax), %edx
+; X86-NEXT: movl %edx, 36(%eax)
+; X86-NEXT: movl %ecx, 32(%eax)
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vmovsd %xmm0, 40(%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_vec_element_and_scalar_load:
+; X64: # %bb.0:
+; X64-NEXT: vmovups (%rdi), %xmm0
+; X64-NEXT: vmovups %xmm0, 32(%rdi)
+; X64-NEXT: retq
%idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
%idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
%idx4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
@@ -850,12 +1497,20 @@ define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
; Don't let a non-consecutive store thwart merging of the last two.
define void @almost_consecutive_stores(i8* %p) {
-; CHECK-LABEL: almost_consecutive_stores:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movb $0, (%rdi)
-; CHECK-NEXT: movb $1, 42(%rdi)
-; CHECK-NEXT: movw $770, 2(%rdi) # imm = 0x302
-; CHECK-NEXT: retq
+; X86-LABEL: almost_consecutive_stores:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb $0, (%eax)
+; X86-NEXT: movb $1, 42(%eax)
+; X86-NEXT: movw $770, 2(%eax) # imm = 0x302
+; X86-NEXT: retl
+;
+; X64-LABEL: almost_consecutive_stores:
+; X64: # %bb.0:
+; X64-NEXT: movb $0, (%rdi)
+; X64-NEXT: movb $1, 42(%rdi)
+; X64-NEXT: movw $770, 2(%rdi) # imm = 0x302
+; X64-NEXT: retq
store i8 0, i8* %p
%p1 = getelementptr i8, i8* %p, i64 42
store i8 1, i8* %p1
@@ -868,10 +1523,16 @@ define void @almost_consecutive_stores(i8* %p) {
; We should be able to merge these.
define void @merge_bitcast(<4 x i32> %v, float* %ptr) {
-; CHECK-LABEL: merge_bitcast:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovups %xmm0, (%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: merge_bitcast:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovups %xmm0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_bitcast:
+; X64: # %bb.0:
+; X64-NEXT: vmovups %xmm0, (%rdi)
+; X64-NEXT: retq
%fv = bitcast <4 x i32> %v to <4 x float>
%vecext1 = extractelement <4 x i32> %v, i32 1
%vecext2 = extractelement <4 x i32> %v, i32 2
@@ -893,21 +1554,39 @@ define void @merge_bitcast(<4 x i32> %v, float* %ptr) {
; same as @merge_const_store with heterogeneous types.
define void @merge_const_store_heterogeneous(i32 %count, %struct.C* nocapture %p) nounwind uwtable noinline ssp {
-; CHECK-LABEL: merge_const_store_heterogeneous:
-; CHECK: # %bb.0:
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jle .LBB23_3
-; CHECK-NEXT: # %bb.1: # %.lr.ph.preheader
-; CHECK-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB23_2: # %.lr.ph
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq %rax, (%rsi)
-; CHECK-NEXT: addq $24, %rsi
-; CHECK-NEXT: decl %edi
-; CHECK-NEXT: jne .LBB23_2
-; CHECK-NEXT: .LBB23_3: # %._crit_edge
-; CHECK-NEXT: retq
+; X86-LABEL: merge_const_store_heterogeneous:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: jle .LBB23_3
+; X86-NEXT: # %bb.1: # %.lr.ph.preheader
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB23_2: # %.lr.ph
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl $67305985, (%ecx) # imm = 0x4030201
+; X86-NEXT: movl $134678021, 4(%ecx) # imm = 0x8070605
+; X86-NEXT: addl $24, %ecx
+; X86-NEXT: decl %eax
+; X86-NEXT: jne .LBB23_2
+; X86-NEXT: .LBB23_3: # %._crit_edge
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_const_store_heterogeneous:
+; X64: # %bb.0:
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: jle .LBB23_3
+; X64-NEXT: # %bb.1: # %.lr.ph.preheader
+; X64-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB23_2: # %.lr.ph
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: movq %rax, (%rsi)
+; X64-NEXT: addq $24, %rsi
+; X64-NEXT: decl %edi
+; X64-NEXT: jne .LBB23_2
+; X64-NEXT: .LBB23_3: # %._crit_edge
+; X64-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
@@ -933,11 +1612,21 @@ define void @merge_const_store_heterogeneous(i32 %count, %struct.C* nocapture %p
; Merging heterogeneous integer types.
define void @merge_heterogeneous(%struct.C* nocapture %p, %struct.C* nocapture %q) {
-; CHECK-LABEL: merge_heterogeneous:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq %rax, (%rsi)
-; CHECK-NEXT: retq
+; X86-LABEL: merge_heterogeneous:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: movl 4(%ecx), %ecx
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_heterogeneous:
+; X64: # %bb.0:
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: movq %rax, (%rsi)
+; X64-NEXT: retq
%s0 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 0
%s1 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 1
%s2 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 2
@@ -962,11 +1651,19 @@ define void @merge_heterogeneous(%struct.C* nocapture %p, %struct.C* nocapture %
}
define i32 @merge_store_load_store_seq(i32* %buff) {
-; CHECK-LABEL: merge_store_load_store_seq:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl 4(%rdi), %eax
-; CHECK-NEXT: movq $0, (%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: merge_store_load_store_seq:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $0, (%ecx)
+; X86-NEXT: movl 4(%ecx), %eax
+; X86-NEXT: movl $0, 4(%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_store_load_store_seq:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl 4(%rdi), %eax
+; X64-NEXT: movq $0, (%rdi)
+; X64-NEXT: retq
entry:
store i32 0, i32* %buff, align 4
@@ -977,12 +1674,21 @@ entry:
}
define i32 @merge_store_alias(i32* %buff, i32* %other) {
-; CHECK-LABEL: merge_store_alias:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl $0, (%rdi)
-; CHECK-NEXT: movl (%rsi), %eax
-; CHECK-NEXT: movl $0, 4(%rdi)
-; CHECK-NEXT: retq
+; X86-LABEL: merge_store_alias:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $0, (%ecx)
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: movl $0, 4(%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: merge_store_alias:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl $0, (%rdi)
+; X64-NEXT: movl (%rsi), %eax
+; X64-NEXT: movl $0, 4(%rdi)
+; X64-NEXT: retq
entry:
store i32 0, i32* %buff, align 4