[llvm] d5d498f - [X86][AMX] Simplify AMX test case.
via llvm-commits
llvm-commits at lists.llvm.org
Sun May 8 04:13:05 PDT 2022
Author: Luo, Yuanke
Date: 2022-05-08T19:12:54+08:00
New Revision: d5d498f9baae218c56dc3a3582ef0083f795f088
URL: https://github.com/llvm/llvm-project/commit/d5d498f9baae218c56dc3a3582ef0083f795f088
DIFF: https://github.com/llvm/llvm-project/commit/d5d498f9baae218c56dc3a3582ef0083f795f088.diff
LOG: [X86][AMX] Simplify AMX test case.
Extract test for zero tile configure into a small test case.
Added:
llvm/test/CodeGen/X86/AMX/amx-zero-config.ll
Modified:
llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
index f7089e98fcfea..0bc849db31a8b 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
@@ -1,25 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
-
-
-source_filename = "amx_api.c"
%struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
-
@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
-; Function Attrs: noinline nounwind optnone uwtable
define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #0 {
; AVX512-LABEL: test_api:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: .cfi_def_cfa_register %rbp
; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00
; AVX512-NEXT: subq $25600, %rsp # imm = 0x6400
; AVX512-NEXT: movw %dx, %ax
@@ -665,3537 +655,9 @@ define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa %rsp, 8
; AVX512-NEXT: tilerelease
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
-;
-; AVX2-LABEL: test_api:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: andq $-1024, %rsp # imm = 0xFC00
-; AVX2-NEXT: subq $29696, %rsp # imm = 0x7400
-; AVX2-NEXT: movw %dx, %ax
-; AVX2-NEXT: movw %si, %cx
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: movl $1088, %edx # imm = 0x440
-; AVX2-NEXT: callq memset@PLT
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: movl $1088, %edx # imm = 0x440
-; AVX2-NEXT: callq memset@PLT
-; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: movl $1088, %edx # imm = 0x440
-; AVX2-NEXT: callq memset@PLT
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: je .LBB0_2
-; AVX2-NEXT: # %bb.1: # %if.then
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %si
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %dil
-; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %di
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %r8b
-; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: tileloadd (%rdx,%rdi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %si
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %r8b
-; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg (%rdi)
-; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: jmp .LBB0_3
-; AVX2-NEXT: .LBB0_2: # %if.else
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %si
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %dil
-; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %di
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %r8b
-; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: tileloadd (%rdx,%rdi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw (%rax), %si
-; AVX2-NEXT: movw 2(%rax), %dx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %r8b
-; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg (%rdi)
-; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: addq $64, %rdx
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: .LBB0_3: # %if.end
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: movl $1088, %edx # imm = 0x440
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: vmovaps 64(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 96(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 128(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 160(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 192(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 224(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 256(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 288(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 320(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 352(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 384(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 416(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 448(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 480(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 512(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 544(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 576(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 608(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 640(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 672(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 704(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 736(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 768(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 800(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 832(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 864(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 896(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 928(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 960(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 992(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 1024(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 1056(%rax), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: movl $1024, %edx # imm = 0x400
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: # kill: def $r8 killed $rax
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; AVX2-NEXT: movw %r10w, %di
-; AVX2-NEXT: shrl $2, %r10d
-; AVX2-NEXT: movw %r10w, %r9w
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %r8b
-; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: # kill: def $r10b killed $r10b killed $r10d
-; AVX2-NEXT: movb %r10b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl $64, %r8d
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: tileloadd (%r10,%r8), %tmm0
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: tileloadd (%r10,%r8), %tmm1
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: tileloadd (%r10,%r8), %tmm2
-; AVX2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: addq $64, %rdi
-; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: callq memcpy@PLT
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: # kill: def $rdi killed $rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm2
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm3
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm4
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm5
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm6
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm7
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm8
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm9
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm10
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm11
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm12
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm13
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm14
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm15
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %r9b
-; AVX2-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg (%r8)
-; AVX2-NEXT: movl $64, %r8d
-; AVX2-NEXT: tileloadd (%rdi,%r8), %tmm0
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa %rsp, 8
-; AVX2-NEXT: tilerelease
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; SSE2-LABEL: test_api:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
-; SSE2-NEXT: .cfi_offset %rbp, -16
-; SSE2-NEXT: movq %rsp, %rbp
-; SSE2-NEXT: .cfi_def_cfa_register %rbp
-; SSE2-NEXT: andq $-1024, %rsp # imm = 0xFC00
-; SSE2-NEXT: subq $30720, %rsp # imm = 0x7800
-; SSE2-NEXT: movw %dx, %ax
-; SSE2-NEXT: movw %si, %cx
-; SSE2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: xorl %esi, %esi
-; SSE2-NEXT: movl $1088, %edx # imm = 0x440
-; SSE2-NEXT: callq memset@PLT
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: xorl %esi, %esi
-; SSE2-NEXT: movl $1088, %edx # imm = 0x440
-; SSE2-NEXT: callq memset@PLT
-; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: xorl %esi, %esi
-; SSE2-NEXT: movl $1088, %edx # imm = 0x440
-; SSE2-NEXT: callq memset@PLT
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: je .LBB0_2
-; SSE2-NEXT: # %bb.1: # %if.then
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw (%rax), %si
-; SSE2-NEXT: movw 2(%rax), %dx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %dil
-; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: addq $64, %rdx
-; SSE2-NEXT: movl $64, %esi
-; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw (%rax), %di
-; SSE2-NEXT: movw 2(%rax), %dx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %r8b
-; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT: tileloadd (%rdx,%rdi), %tmm0
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: addq $64, %rdx
-; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw (%rax), %si
-; SSE2-NEXT: movw 2(%rax), %dx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %r8b
-; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg (%rdi)
-; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: addq $64, %rdx
-; SSE2-NEXT: movl $64, %esi
-; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT: jmp .LBB0_3
-; SSE2-NEXT: .LBB0_2: # %if.else
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw (%rax), %si
-; SSE2-NEXT: movw 2(%rax), %dx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %dil
-; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: addq $64, %rdx
-; SSE2-NEXT: movl $64, %esi
-; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw (%rax), %di
-; SSE2-NEXT: movw 2(%rax), %dx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %r8b
-; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT: tileloadd (%rdx,%rdi), %tmm0
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: addq $64, %rdx
-; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $buf2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw (%rax), %si
-; SSE2-NEXT: movw 2(%rax), %dx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %r8b
-; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg (%rdi)
-; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: addq $64, %rdx
-; SSE2-NEXT: movl $64, %esi
-; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT: .LBB0_3: # %if.end
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: movl $1088, %edx # imm = 0x440
-; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: callq memcpy@PLT
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: callq memcpy@PLT
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movaps 64(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 80(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 96(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 112(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 128(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 144(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 160(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 176(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 192(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 208(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 224(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 240(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 256(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 272(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 288(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 304(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 320(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 336(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 352(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 368(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 384(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 400(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 416(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 432(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 448(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 464(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 480(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 496(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 512(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 528(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 544(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 560(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 576(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 592(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 608(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 624(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 640(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 656(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 672(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 688(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 704(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 720(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 736(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 752(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 768(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 784(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 800(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 816(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 832(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 848(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 864(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 880(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 896(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 912(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 928(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 944(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 960(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 976(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 992(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 1008(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 1024(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 1040(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 1056(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps 1072(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: movl $1024, %edx # imm = 0x400
-; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: callq memcpy@PLT
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: callq memcpy@PLT
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: callq memcpy@PLT
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: # kill: def $r8 killed $rax
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
-; SSE2-NEXT: movw %r10w, %di
-; SSE2-NEXT: shrl $2, %r10d
-; SSE2-NEXT: movw %r10w, %r9w
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %r8b
-; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %di, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: # kill: def $r10b killed $r10b killed $r10d
-; SSE2-NEXT: movb %r10b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movl $64, %r8d
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: tileloadd (%r10,%r8), %tmm0
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: tileloadd (%r10,%r8), %tmm1
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: tileloadd (%r10,%r8), %tmm2
-; SSE2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: addq $64, %rdi
-; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: callq memcpy@PLT
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: movq $buf, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: callq memcpy@PLT
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE2-NEXT: # kill: def $rdi killed $rax
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: movw %si, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT: movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %r9b
-; SSE2-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg (%r8)
-; SSE2-NEXT: movl $64, %r8d
-; SSE2-NEXT: tileloadd (%rdi,%r8), %tmm0
-; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT: movq %rbp, %rsp
-; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa %rsp, 8
-; SSE2-NEXT: tilerelease
-; SSE2-NEXT: retq
entry:
%m.addr.i85 = alloca i16, align 2
%n.addr.i86 = alloca i16, align 2
@@ -4538,22 +1000,13 @@ if.end: ; preds = %if.else, %if.then
ret void
}
-; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1
-
-; Function Attrs: nounwind
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #2
-
-; Function Attrs: nounwind
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2
-
-; Function Attrs: nounwind
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #2
-
-; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3
-attributes #0 = { noinline nounwind optnone uwtable }
+attributes #0 = { noinline nounwind optnone }
attributes #1 = { argmemonly nofree nosync nounwind willreturn writeonly }
attributes #2 = { nounwind }
attributes #3 = { argmemonly nofree nosync nounwind willreturn }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
index 0771d93e1a684..38c01f2f46cce 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
@@ -1,20 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
-; Function Attrs: nounwind uwtable
-define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
+define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) nounwind {
; AVX512-LABEL: test_api:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .cfi_def_cfa_offset 16
-; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: .cfi_def_cfa_register %rbp
; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00
; AVX512-NEXT: subq $6144, %rsp # imm = 0x1800
; AVX512-NEXT: movw %dx, %ax
@@ -149,335 +143,9 @@ define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) l
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: .cfi_def_cfa %rsp, 8
; AVX512-NEXT: tilerelease
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
-;
-; AVX2-LABEL: test_api:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: andq $-1024, %rsp # imm = 0xFC00
-; AVX2-NEXT: subq $6144, %rsp # imm = 0x1800
-; AVX2-NEXT: movw %dx, %ax
-; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT: movw %si, %ax
-; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: cmpl $0, %edi
-; AVX2-NEXT: je .LBB0_2
-; AVX2-NEXT: # %bb.1: # %if.then
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %sil
-; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl $buf, %r9d
-; AVX2-NEXT: movl $32, %r10d
-; AVX2-NEXT: movw $8, %si
-; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
-; AVX2-NEXT: movl $64, %r8d
-; AVX2-NEXT: tilestored %tmm0, (%r11,%r8)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
-; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %dil
-; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg (%rsi)
-; AVX2-NEXT: movl $buf, %esi
-; AVX2-NEXT: movl $32, %edi
-; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: jmp .LBB0_3
-; AVX2-NEXT: .LBB0_2: # %if.else
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %sil
-; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl $buf2, %r9d
-; AVX2-NEXT: movl $32, %r10d
-; AVX2-NEXT: movw $8, %si
-; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
-; AVX2-NEXT: movl $64, %r8d
-; AVX2-NEXT: tilestored %tmm0, (%r11,%r8)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
-; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %dil
-; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg (%rsi)
-; AVX2-NEXT: movl $buf2, %esi
-; AVX2-NEXT: movl $32, %edi
-; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: .LBB0_3: # %if.end
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %sil
-; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: movw $8, %di
-; AVX2-NEXT: tileloadd (%r10,%rsi), %tmm1
-; AVX2-NEXT: tileloadd (%r9,%rsi), %tmm2
-; AVX2-NEXT: tileloadd (%r8,%rsi), %tmm0
-; AVX2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movb %al, %dil
-; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: ldtilecfg (%rsi)
-; AVX2-NEXT: movl $64, %esi
-; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT: movl $buf, %edx
-; AVX2-NEXT: movl $32, %esi
-; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: .cfi_def_cfa %rsp, 8
-; AVX2-NEXT: tilerelease
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; SSE2-LABEL: test_api:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pushq %rbp
-; SSE2-NEXT: .cfi_def_cfa_offset 16
-; SSE2-NEXT: .cfi_offset %rbp, -16
-; SSE2-NEXT: movq %rsp, %rbp
-; SSE2-NEXT: .cfi_def_cfa_register %rbp
-; SSE2-NEXT: andq $-1024, %rsp # imm = 0xFC00
-; SSE2-NEXT: subq $6144, %rsp # imm = 0x1800
-; SSE2-NEXT: movw %dx, %ax
-; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT: movw %si, %ax
-; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: cmpl $0, %edi
-; SSE2-NEXT: je .LBB0_2
-; SSE2-NEXT: # %bb.1: # %if.then
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %sil
-; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movl $buf, %r9d
-; SSE2-NEXT: movl $32, %r10d
-; SSE2-NEXT: movw $8, %si
-; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
-; SSE2-NEXT: movl $64, %r8d
-; SSE2-NEXT: tilestored %tmm0, (%r11,%r8)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
-; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %dil
-; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg (%rsi)
-; SSE2-NEXT: movl $buf, %esi
-; SSE2-NEXT: movl $32, %edi
-; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0
-; SSE2-NEXT: movl $64, %esi
-; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT: jmp .LBB0_3
-; SSE2-NEXT: .LBB0_2: # %if.else
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %sil
-; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movl $buf2, %r9d
-; SSE2-NEXT: movl $32, %r10d
-; SSE2-NEXT: movw $8, %si
-; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
-; SSE2-NEXT: movl $64, %r8d
-; SSE2-NEXT: tilestored %tmm0, (%r11,%r8)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
-; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %dil
-; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg (%rsi)
-; SSE2-NEXT: movl $buf2, %esi
-; SSE2-NEXT: movl $32, %edi
-; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0
-; SSE2-NEXT: movl $64, %esi
-; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT: .LBB0_3: # %if.end
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %sil
-; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movl $64, %esi
-; SSE2-NEXT: movw $8, %di
-; SSE2-NEXT: tileloadd (%r10,%rsi), %tmm1
-; SSE2-NEXT: tileloadd (%r9,%rsi), %tmm2
-; SSE2-NEXT: tileloadd (%r8,%rsi), %tmm0
-; SSE2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
-; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb %al, %dil
-; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT: ldtilecfg (%rsi)
-; SSE2-NEXT: movl $64, %esi
-; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
-; SSE2-NEXT: movl $buf, %edx
-; SSE2-NEXT: movl $32, %esi
-; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT: movq %rbp, %rsp
-; SSE2-NEXT: popq %rbp
-; SSE2-NEXT: .cfi_def_cfa %rsp, 8
-; SSE2-NEXT: tilerelease
-; SSE2-NEXT: retq
entry:
%tobool.not = icmp eq i32 %cond, 0
br i1 %tobool.not, label %if.else, label %if.then
@@ -503,11 +171,6 @@ if.end: ; preds = %if.else, %if.then
ret void
}
-; Function Attrs: nounwind
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
-
-; Function Attrs: nounwind
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-
-; Function Attrs: nounwind
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll b/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll
new file mode 100644
index 0000000000000..a76a1add0676a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll
@@ -0,0 +1,197 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -O0 | FileCheck %s --check-prefix=AVX512-O0
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 -O0 | FileCheck %s --check-prefix=AVX2-O0
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -O0 | FileCheck %s --check-prefix=SSE2-O0
+
+define void @foo(i8 *%buf) nounwind {
+; AVX512-LABEL: foo:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $1, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw $32, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movw $32, %ax
+; AVX512-NEXT: movw $8, %cx
+; AVX512-NEXT: tilezero %tmm0
+; AVX512-NEXT: movl $1024, %edx # imm = 0x400
+; AVX512-NEXT: tilestored %tmm0, (%rdi,%rdx)
+; AVX512-NEXT: tilerelease
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX2-LABEL: foo:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw $32, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movw $32, %ax
+; AVX2-NEXT: movw $8, %cx
+; AVX2-NEXT: tilezero %tmm0
+; AVX2-NEXT: movl $1024, %edx # imm = 0x400
+; AVX2-NEXT: tilestored %tmm0, (%rdi,%rdx)
+; AVX2-NEXT: tilerelease
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; SSE2-LABEL: foo:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw $32, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movw $32, %ax
+; SSE2-NEXT: movw $8, %cx
+; SSE2-NEXT: tilezero %tmm0
+; SSE2-NEXT: movl $1024, %edx # imm = 0x400
+; SSE2-NEXT: tilestored %tmm0, (%rdi,%rdx)
+; SSE2-NEXT: tilerelease
+; SSE2-NEXT: retq
+;
+; AVX512-O0-LABEL: foo:
+; AVX512-O0: # %bb.0: # %entry
+; AVX512-O0-NEXT: pushq %rbp
+; AVX512-O0-NEXT: movq %rsp, %rbp
+; AVX512-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; AVX512-O0-NEXT: subq $2048, %rsp # imm = 0x800
+; AVX512-O0-NEXT: movq %rsp, %rdx
+; AVX512-O0-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-O0-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT: movw $32, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT: movw $32, %cx
+; AVX512-O0-NEXT: movw $8, %ax
+; AVX512-O0-NEXT: tilezero %tmm0
+; AVX512-O0-NEXT: movl $64, %esi
+; AVX512-O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX512-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-O0-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT: movw $8, %cx
+; AVX512-O0-NEXT: # kill: def $cl killed $cl killed $cx
+; AVX512-O0-NEXT: movb %cl, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT: movw $32, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT: ldtilecfg (%rax)
+; AVX512-O0-NEXT: movl $64, %esi
+; AVX512-O0-NEXT: movw $32, %cx
+; AVX512-O0-NEXT: movw $8, %ax
+; AVX512-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX512-O0-NEXT: movl $1024, %edx # imm = 0x400
+; AVX512-O0-NEXT: movw $32, %cx
+; AVX512-O0-NEXT: movw $8, %ax
+; AVX512-O0-NEXT: tilestored %tmm0, (%rdi,%rdx)
+; AVX512-O0-NEXT: movq %rbp, %rsp
+; AVX512-O0-NEXT: popq %rbp
+; AVX512-O0-NEXT: tilerelease
+; AVX512-O0-NEXT: vzeroupper
+; AVX512-O0-NEXT: retq
+;
+; AVX2-O0-LABEL: foo:
+; AVX2-O0: # %bb.0: # %entry
+; AVX2-O0-NEXT: pushq %rbp
+; AVX2-O0-NEXT: movq %rsp, %rbp
+; AVX2-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; AVX2-O0-NEXT: subq $2048, %rsp # imm = 0x800
+; AVX2-O0-NEXT: movq %rsp, %rdx
+; AVX2-O0-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT: movw $32, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT: movw $32, %cx
+; AVX2-O0-NEXT: movw $8, %ax
+; AVX2-O0-NEXT: tilezero %tmm0
+; AVX2-O0-NEXT: movl $64, %esi
+; AVX2-O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; AVX2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT: movw $8, %cx
+; AVX2-O0-NEXT: # kill: def $cl killed $cl killed $cx
+; AVX2-O0-NEXT: movb %cl, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT: movw $32, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT: ldtilecfg (%rax)
+; AVX2-O0-NEXT: movl $64, %esi
+; AVX2-O0-NEXT: movw $32, %cx
+; AVX2-O0-NEXT: movw $8, %ax
+; AVX2-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; AVX2-O0-NEXT: movl $1024, %edx # imm = 0x400
+; AVX2-O0-NEXT: movw $32, %cx
+; AVX2-O0-NEXT: movw $8, %ax
+; AVX2-O0-NEXT: tilestored %tmm0, (%rdi,%rdx)
+; AVX2-O0-NEXT: movq %rbp, %rsp
+; AVX2-O0-NEXT: popq %rbp
+; AVX2-O0-NEXT: tilerelease
+; AVX2-O0-NEXT: vzeroupper
+; AVX2-O0-NEXT: retq
+;
+; SSE2-O0-LABEL: foo:
+; SSE2-O0: # %bb.0: # %entry
+; SSE2-O0-NEXT: pushq %rbp
+; SSE2-O0-NEXT: movq %rsp, %rbp
+; SSE2-O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
+; SSE2-O0-NEXT: subq $2048, %rsp # imm = 0x800
+; SSE2-O0-NEXT: movq %rsp, %rdx
+; SSE2-O0-NEXT: xorps %xmm0, %xmm0
+; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movw $32, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movw $32, %cx
+; SSE2-O0-NEXT: movw $8, %ax
+; SSE2-O0-NEXT: tilezero %tmm0
+; SSE2-O0-NEXT: movl $64, %esi
+; SSE2-O0-NEXT: tilestored %tmm0, (%rdx,%rsi)
+; SSE2-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movw $8, %cx
+; SSE2-O0-NEXT: # kill: def $cl killed $cl killed $cx
+; SSE2-O0-NEXT: movb %cl, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: movw $32, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT: ldtilecfg (%rax)
+; SSE2-O0-NEXT: movl $64, %esi
+; SSE2-O0-NEXT: movw $32, %cx
+; SSE2-O0-NEXT: movw $8, %ax
+; SSE2-O0-NEXT: tileloadd (%rdx,%rsi), %tmm0
+; SSE2-O0-NEXT: movl $1024, %edx # imm = 0x400
+; SSE2-O0-NEXT: movw $32, %cx
+; SSE2-O0-NEXT: movw $8, %ax
+; SSE2-O0-NEXT: tilestored %tmm0, (%rdi,%rdx)
+; SSE2-O0-NEXT: movq %rbp, %rsp
+; SSE2-O0-NEXT: popq %rbp
+; SSE2-O0-NEXT: tilerelease
+; SSE2-O0-NEXT: retq
+entry:
+ %t = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 32, i8* %buf, i64 1024, x86_amx %t)
+ ret void
+}
+
+declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
More information about the llvm-commits
mailing list