[llvm] d5d498f - [X86][AMX] Simplify AMX test case.

via llvm-commits llvm-commits at lists.llvm.org
Sun May 8 04:13:05 PDT 2022


Author: Luo, Yuanke
Date: 2022-05-08T19:12:54+08:00
New Revision: d5d498f9baae218c56dc3a3582ef0083f795f088

URL: https://github.com/llvm/llvm-project/commit/d5d498f9baae218c56dc3a3582ef0083f795f088
DIFF: https://github.com/llvm/llvm-project/commit/d5d498f9baae218c56dc3a3582ef0083f795f088.diff

LOG: [X86][AMX] Simplify AMX test case.

Extract test for zero tile configure into a small test case.

Added: 
    llvm/test/CodeGen/X86/AMX/amx-zero-config.ll

Modified: 
    llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
    llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
index f7089e98fcfea..0bc849db31a8b 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
@@ -1,25 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
-
-
-source_filename = "amx_api.c"
 
 %struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
-
 @buf = dso_local global [1024 x i8] zeroinitializer, align 16
 @buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
 
-; Function Attrs: noinline nounwind optnone uwtable
 define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #0 {
 ; AVX512-LABEL: test_api:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
-; AVX512-NEXT:    .cfi_offset %rbp, -16
 ; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    .cfi_def_cfa_register %rbp
 ; AVX512-NEXT:    andq $-1024, %rsp # imm = 0xFC00
 ; AVX512-NEXT:    subq $25600, %rsp # imm = 0x6400
 ; AVX512-NEXT:    movw %dx, %ax
@@ -665,3537 +655,9 @@ define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #
 ; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
 ; AVX512-NEXT:    movq %rbp, %rsp
 ; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    .cfi_def_cfa %rsp, 8
 ; AVX512-NEXT:    tilerelease
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-;
-; AVX2-LABEL: test_api:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
-; AVX2-NEXT:    .cfi_offset %rbp, -16
-; AVX2-NEXT:    movq %rsp, %rbp
-; AVX2-NEXT:    .cfi_def_cfa_register %rbp
-; AVX2-NEXT:    andq $-1024, %rsp # imm = 0xFC00
-; AVX2-NEXT:    subq $29696, %rsp # imm = 0x7400
-; AVX2-NEXT:    movw %dx, %ax
-; AVX2-NEXT:    movw %si, %cx
-; AVX2-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    xorl %esi, %esi
-; AVX2-NEXT:    movl $1088, %edx # imm = 0x440
-; AVX2-NEXT:    callq memset@PLT
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    xorl %esi, %esi
-; AVX2-NEXT:    movl $1088, %edx # imm = 0x440
-; AVX2-NEXT:    callq memset@PLT
-; AVX2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    xorl %esi, %esi
-; AVX2-NEXT:    movl $1088, %edx # imm = 0x440
-; AVX2-NEXT:    callq memset@PLT
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    cmpl $0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    je .LBB0_2
-; AVX2-NEXT:  # %bb.1: # %if.then
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $buf, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw (%rax), %si
-; AVX2-NEXT:    movw 2(%rax), %dx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %dil
-; AVX2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    addq $64, %rdx
-; AVX2-NEXT:    movl $64, %esi
-; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $buf, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw (%rax), %di
-; AVX2-NEXT:    movw 2(%rax), %dx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw %di, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %r8b
-; AVX2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    tileloadd (%rdx,%rdi), %tmm0
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    addq $64, %rdx
-; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $buf, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw (%rax), %si
-; AVX2-NEXT:    movw 2(%rax), %dx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %r8b
-; AVX2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg (%rdi)
-; AVX2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    addq $64, %rdx
-; AVX2-NEXT:    movl $64, %esi
-; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT:    jmp .LBB0_3
-; AVX2-NEXT:  .LBB0_2: # %if.else
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $buf2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw (%rax), %si
-; AVX2-NEXT:    movw 2(%rax), %dx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %dil
-; AVX2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    addq $64, %rdx
-; AVX2-NEXT:    movl $64, %esi
-; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $buf2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw (%rax), %di
-; AVX2-NEXT:    movw 2(%rax), %dx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw %di, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %r8b
-; AVX2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    tileloadd (%rdx,%rdi), %tmm0
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    addq $64, %rdx
-; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $buf2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw (%rax), %si
-; AVX2-NEXT:    movw 2(%rax), %dx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %r8b
-; AVX2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg (%rdi)
-; AVX2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    addq $64, %rdx
-; AVX2-NEXT:    movl $64, %esi
-; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT:  .LBB0_3: # %if.end
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    movl $1088, %edx # imm = 0x440
-; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    callq memcpy@PLT
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    callq memcpy@PLT
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovaps 64(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 96(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 128(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 160(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 192(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 224(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 256(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 288(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 320(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 352(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 384(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 416(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 448(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 480(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 512(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 544(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 576(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 608(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 640(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 672(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 704(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 736(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 768(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 800(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 832(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 864(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 896(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 928(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 960(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 992(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 1024(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps 1056(%rax), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm2
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm3
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm4
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm5
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm6
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm7
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm8
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm9
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm10
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm11
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm12
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm13
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm14
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm15
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm3, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    movl $1024, %edx # imm = 0x400
-; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    callq memcpy@PLT
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    callq memcpy@PLT
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    callq memcpy@PLT
-; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
-; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT:    # kill: def $r8 killed $rax
-; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm2
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm3
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm4
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm5
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm6
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm7
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm8
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm9
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm10
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm11
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm12
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm13
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm14
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm15
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    movw %di, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm3, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; AVX2-NEXT:    movw %r10w, %di
-; AVX2-NEXT:    shrl $2, %r10d
-; AVX2-NEXT:    movw %r10w, %r9w
-; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %r8b
-; AVX2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %di, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    # kill: def $r10b killed $r10b killed $r10d
-; AVX2-NEXT:    movb %r10b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movl $64, %r8d
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    tileloadd (%r10,%r8), %tmm0
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    tileloadd (%r10,%r8), %tmm1
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    tileloadd (%r10,%r8), %tmm2
-; AVX2-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    addq $64, %rdi
-; AVX2-NEXT:    tilestored %tmm0, (%rdi,%r8)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    callq memcpy@PLT
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT:    movq $buf, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm2
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm3
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm4
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm5
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm6
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm7
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm8
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm9
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm10
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm11
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm12
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm13
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm14
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm15
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm3, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    callq memcpy@PLT
-; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
-; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    # kill: def $rdi killed $rax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm2
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm3
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm4
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm5
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm6
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm7
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm8
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm9
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm10
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm11
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm12
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm13
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm14
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm15
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm0
-; AVX2-NEXT:    movw %si, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT:    vmovaps %ymm15, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm14, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm13, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm12, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm11, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm10, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm9, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm7, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm6, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm5, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm4, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm3, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm2, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; AVX2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %r9b
-; AVX2-NEXT:    movb %r9b, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg (%r8)
-; AVX2-NEXT:    movl $64, %r8d
-; AVX2-NEXT:    tileloadd (%rdi,%r8), %tmm0
-; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT:    movq %rbp, %rsp
-; AVX2-NEXT:    popq %rbp
-; AVX2-NEXT:    .cfi_def_cfa %rsp, 8
-; AVX2-NEXT:    tilerelease
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; SSE2-LABEL: test_api:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    pushq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
-; SSE2-NEXT:    .cfi_offset %rbp, -16
-; SSE2-NEXT:    movq %rsp, %rbp
-; SSE2-NEXT:    .cfi_def_cfa_register %rbp
-; SSE2-NEXT:    andq $-1024, %rsp # imm = 0xFC00
-; SSE2-NEXT:    subq $30720, %rsp # imm = 0x7800
-; SSE2-NEXT:    movw %dx, %ax
-; SSE2-NEXT:    movw %si, %cx
-; SSE2-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    xorl %esi, %esi
-; SSE2-NEXT:    movl $1088, %edx # imm = 0x440
-; SSE2-NEXT:    callq memset@PLT
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    xorl %esi, %esi
-; SSE2-NEXT:    movl $1088, %edx # imm = 0x440
-; SSE2-NEXT:    callq memset@PLT
-; SSE2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    xorl %esi, %esi
-; SSE2-NEXT:    movl $1088, %edx # imm = 0x440
-; SSE2-NEXT:    callq memset@PLT
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    cmpl $0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    je .LBB0_2
-; SSE2-NEXT:  # %bb.1: # %if.then
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $buf, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw (%rax), %si
-; SSE2-NEXT:    movw 2(%rax), %dx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw %si, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %dil
-; SSE2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    addq $64, %rdx
-; SSE2-NEXT:    movl $64, %esi
-; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $buf, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw (%rax), %di
-; SSE2-NEXT:    movw 2(%rax), %dx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw %di, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %r8b
-; SSE2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    tileloadd (%rdx,%rdi), %tmm0
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    addq $64, %rdx
-; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $buf, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw (%rax), %si
-; SSE2-NEXT:    movw 2(%rax), %dx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw %si, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %r8b
-; SSE2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg (%rdi)
-; SSE2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    addq $64, %rdx
-; SSE2-NEXT:    movl $64, %esi
-; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT:    jmp .LBB0_3
-; SSE2-NEXT:  .LBB0_2: # %if.else
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $buf2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw (%rax), %si
-; SSE2-NEXT:    movw 2(%rax), %dx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw %si, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %dil
-; SSE2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    addq $64, %rdx
-; SSE2-NEXT:    movl $64, %esi
-; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $buf2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw (%rax), %di
-; SSE2-NEXT:    movw 2(%rax), %dx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw %di, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %r8b
-; SSE2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    tileloadd (%rdx,%rdi), %tmm0
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    addq $64, %rdx
-; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $buf2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw (%rax), %si
-; SSE2-NEXT:    movw 2(%rax), %dx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movw %si, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %r8b
-; SSE2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg (%rdi)
-; SSE2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    addq $64, %rdx
-; SSE2-NEXT:    movl $64, %esi
-; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT:  .LBB0_3: # %if.end
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    movl $1088, %edx # imm = 0x440
-; SSE2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    callq memcpy@PLT
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    callq memcpy@PLT
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movaps 64(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 80(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 96(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 112(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 128(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 144(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 160(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 176(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 192(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 208(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 224(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 240(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 256(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 272(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 288(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 304(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 320(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 336(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 352(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 368(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 384(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 400(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 416(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 432(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 448(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 464(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 480(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 496(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 512(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 528(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 544(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 560(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 576(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 592(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 608(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 624(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 640(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 656(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 672(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 688(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 704(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 720(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 736(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 752(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 768(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 784(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 800(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 816(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 832(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 848(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 864(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 880(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 896(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 912(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 928(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 944(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 960(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 976(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 992(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 1008(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 1024(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 1040(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 1056(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps 1072(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    movl $1024, %edx # imm = 0x400
-; SSE2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    callq memcpy@PLT
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    callq memcpy@PLT
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    callq memcpy@PLT
-; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
-; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT:    # kill: def $r8 killed $rax
-; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movw %di, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; SSE2-NEXT:    movw %r10w, %di
-; SSE2-NEXT:    shrl $2, %r10d
-; SSE2-NEXT:    movw %r10w, %r9w
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %r8b
-; SSE2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %di, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    # kill: def $r10b killed $r10b killed $r10d
-; SSE2-NEXT:    movb %r10b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %r8b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movl $64, %r8d
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT:    tileloadd (%r10,%r8), %tmm0
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT:    tileloadd (%r10,%r8), %tmm1
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT:    tileloadd (%r10,%r8), %tmm2
-; SSE2-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    addq $64, %rdi
-; SSE2-NEXT:    tilestored %tmm0, (%rdi,%r8)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    callq memcpy@PLT
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT:    movq $buf, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq $32, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    callq memcpy@PLT
-; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
-; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE2-NEXT:    # kill: def $rdi killed $rax
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT:    movw %si, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %dx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    movaps %xmm15, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm14, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm13, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm12, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm11, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm10, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm9, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm7, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm6, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm5, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %ax
-; SSE2-NEXT:    movw {{[0-9]+}}(%rsp), %cx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %r9b
-; SSE2-NEXT:    movb %r9b, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg (%r8)
-; SSE2-NEXT:    movl $64, %r8d
-; SSE2-NEXT:    tileloadd (%rdi,%r8), %tmm0
-; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT:    movq %rbp, %rsp
-; SSE2-NEXT:    popq %rbp
-; SSE2-NEXT:    .cfi_def_cfa %rsp, 8
-; SSE2-NEXT:    tilerelease
-; SSE2-NEXT:    retq
 entry:
   %m.addr.i85 = alloca i16, align 2
   %n.addr.i86 = alloca i16, align 2
@@ -4538,22 +1000,13 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
 declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1
-
-; Function Attrs: nounwind
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) #2
-
-; Function Attrs: nounwind
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2
-
-; Function Attrs: nounwind
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #2
-
-; Function Attrs: argmemonly nofree nosync nounwind willreturn
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #3
 
-attributes #0 = { noinline nounwind optnone uwtable }
+attributes #0 = { noinline nounwind optnone }
 attributes #1 = { argmemonly nofree nosync nounwind willreturn writeonly }
 attributes #2 = { nounwind }
 attributes #3 = { argmemonly nofree nosync nounwind willreturn }

diff  --git a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
index 0771d93e1a684..38c01f2f46cce 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
@@ -1,20 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
 
 @buf = dso_local global [1024 x i8] zeroinitializer, align 16
 @buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
 
-; Function Attrs: nounwind uwtable
-define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
+define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) nounwind {
 ; AVX512-LABEL: test_api:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    .cfi_def_cfa_offset 16
-; AVX512-NEXT:    .cfi_offset %rbp, -16
 ; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    .cfi_def_cfa_register %rbp
 ; AVX512-NEXT:    andq $-1024, %rsp # imm = 0xFC00
 ; AVX512-NEXT:    subq $6144, %rsp # imm = 0x1800
 ; AVX512-NEXT:    movw %dx, %ax
@@ -149,335 +143,9 @@ define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) l
 ; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
 ; AVX512-NEXT:    movq %rbp, %rsp
 ; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    .cfi_def_cfa %rsp, 8
 ; AVX512-NEXT:    tilerelease
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
-;
-; AVX2-LABEL: test_api:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    .cfi_def_cfa_offset 16
-; AVX2-NEXT:    .cfi_offset %rbp, -16
-; AVX2-NEXT:    movq %rsp, %rbp
-; AVX2-NEXT:    .cfi_def_cfa_register %rbp
-; AVX2-NEXT:    andq $-1024, %rsp # imm = 0xFC00
-; AVX2-NEXT:    subq $6144, %rsp # imm = 0x1800
-; AVX2-NEXT:    movw %dx, %ax
-; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT:    movw %si, %ax
-; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    cmpl $0, %edi
-; AVX2-NEXT:    je .LBB0_2
-; AVX2-NEXT:  # %bb.1: # %if.then
-; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %sil
-; AVX2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movl $buf, %r9d
-; AVX2-NEXT:    movl $32, %r10d
-; AVX2-NEXT:    movw $8, %si
-; AVX2-NEXT:    tileloadd (%r9,%r10), %tmm0
-; AVX2-NEXT:    movl $64, %r8d
-; AVX2-NEXT:    tilestored %tmm0, (%r11,%r8)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    tileloadd (%r9,%r10), %tmm0
-; AVX2-NEXT:    tilestored %tmm0, (%rdi,%r8)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %dil
-; AVX2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg (%rsi)
-; AVX2-NEXT:    movl $buf, %esi
-; AVX2-NEXT:    movl $32, %edi
-; AVX2-NEXT:    tileloadd (%rsi,%rdi), %tmm0
-; AVX2-NEXT:    movl $64, %esi
-; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT:    jmp .LBB0_3
-; AVX2-NEXT:  .LBB0_2: # %if.else
-; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %sil
-; AVX2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movl $buf2, %r9d
-; AVX2-NEXT:    movl $32, %r10d
-; AVX2-NEXT:    movw $8, %si
-; AVX2-NEXT:    tileloadd (%r9,%r10), %tmm0
-; AVX2-NEXT:    movl $64, %r8d
-; AVX2-NEXT:    tilestored %tmm0, (%r11,%r8)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    tileloadd (%r9,%r10), %tmm0
-; AVX2-NEXT:    tilestored %tmm0, (%rdi,%r8)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %dil
-; AVX2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg (%rsi)
-; AVX2-NEXT:    movl $buf2, %esi
-; AVX2-NEXT:    movl $32, %edi
-; AVX2-NEXT:    tileloadd (%rsi,%rdi), %tmm0
-; AVX2-NEXT:    movl $64, %esi
-; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT:  .LBB0_3: # %if.end
-; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %sil
-; AVX2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movl $64, %esi
-; AVX2-NEXT:    movw $8, %di
-; AVX2-NEXT:    tileloadd (%r10,%rsi), %tmm1
-; AVX2-NEXT:    tileloadd (%r9,%rsi), %tmm2
-; AVX2-NEXT:    tileloadd (%r8,%rsi), %tmm0
-; AVX2-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
-; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movb %al, %dil
-; AVX2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; AVX2-NEXT:    ldtilecfg (%rsi)
-; AVX2-NEXT:    movl $64, %esi
-; AVX2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
-; AVX2-NEXT:    movl $buf, %edx
-; AVX2-NEXT:    movl $32, %esi
-; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; AVX2-NEXT:    movq %rbp, %rsp
-; AVX2-NEXT:    popq %rbp
-; AVX2-NEXT:    .cfi_def_cfa %rsp, 8
-; AVX2-NEXT:    tilerelease
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; SSE2-LABEL: test_api:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    pushq %rbp
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
-; SSE2-NEXT:    .cfi_offset %rbp, -16
-; SSE2-NEXT:    movq %rsp, %rbp
-; SSE2-NEXT:    .cfi_def_cfa_register %rbp
-; SSE2-NEXT:    andq $-1024, %rsp # imm = 0xFC00
-; SSE2-NEXT:    subq $6144, %rsp # imm = 0x1800
-; SSE2-NEXT:    movw %dx, %ax
-; SSE2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT:    movw %si, %ax
-; SSE2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    cmpl $0, %edi
-; SSE2-NEXT:    je .LBB0_2
-; SSE2-NEXT:  # %bb.1: # %if.then
-; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %sil
-; SSE2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movl $buf, %r9d
-; SSE2-NEXT:    movl $32, %r10d
-; SSE2-NEXT:    movw $8, %si
-; SSE2-NEXT:    tileloadd (%r9,%r10), %tmm0
-; SSE2-NEXT:    movl $64, %r8d
-; SSE2-NEXT:    tilestored %tmm0, (%r11,%r8)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    tileloadd (%r9,%r10), %tmm0
-; SSE2-NEXT:    tilestored %tmm0, (%rdi,%r8)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %dil
-; SSE2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg (%rsi)
-; SSE2-NEXT:    movl $buf, %esi
-; SSE2-NEXT:    movl $32, %edi
-; SSE2-NEXT:    tileloadd (%rsi,%rdi), %tmm0
-; SSE2-NEXT:    movl $64, %esi
-; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT:    jmp .LBB0_3
-; SSE2-NEXT:  .LBB0_2: # %if.else
-; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %sil
-; SSE2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movl $buf2, %r9d
-; SSE2-NEXT:    movl $32, %r10d
-; SSE2-NEXT:    movw $8, %si
-; SSE2-NEXT:    tileloadd (%r9,%r10), %tmm0
-; SSE2-NEXT:    movl $64, %r8d
-; SSE2-NEXT:    tilestored %tmm0, (%r11,%r8)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    tileloadd (%r9,%r10), %tmm0
-; SSE2-NEXT:    tilestored %tmm0, (%rdi,%r8)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %dil
-; SSE2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg (%rsi)
-; SSE2-NEXT:    movl $buf2, %esi
-; SSE2-NEXT:    movl $32, %edi
-; SSE2-NEXT:    tileloadd (%rsi,%rdi), %tmm0
-; SSE2-NEXT:    movl $64, %esi
-; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT:  .LBB0_3: # %if.end
-; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
-; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %sil
-; SSE2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movl $64, %esi
-; SSE2-NEXT:    movw $8, %di
-; SSE2-NEXT:    tileloadd (%r10,%rsi), %tmm1
-; SSE2-NEXT:    tileloadd (%r9,%rsi), %tmm2
-; SSE2-NEXT:    tileloadd (%r8,%rsi), %tmm0
-; SSE2-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
-; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb %al, %dil
-; SSE2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
-; SSE2-NEXT:    ldtilecfg (%rsi)
-; SSE2-NEXT:    movl $64, %esi
-; SSE2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
-; SSE2-NEXT:    movl $buf, %edx
-; SSE2-NEXT:    movl $32, %esi
-; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
-; SSE2-NEXT:    movq %rbp, %rsp
-; SSE2-NEXT:    popq %rbp
-; SSE2-NEXT:    .cfi_def_cfa %rsp, 8
-; SSE2-NEXT:    tilerelease
-; SSE2-NEXT:    retq
 entry:
   %tobool.not = icmp eq i32 %cond, 0
   br i1 %tobool.not, label %if.else, label %if.then
@@ -503,11 +171,6 @@ if.end:                                           ; preds = %if.else, %if.then
   ret void
 }
 
-; Function Attrs: nounwind
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
-
-; Function Attrs: nounwind
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-
-; Function Attrs: nounwind
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)

diff  --git a/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll b/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll
new file mode 100644
index 0000000000000..a76a1add0676a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-zero-config.ll
@@ -0,0 +1,197 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -O0 | FileCheck %s --check-prefix=AVX512-O0
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 -O0 | FileCheck %s --check-prefix=AVX2-O0
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -O0 | FileCheck %s --check-prefix=SSE2-O0
+
+define void @foo(i8 *%buf) nounwind { ; Zero one AMX tile and store it to %buf; the autogenerated checks below pin the zero-fill of the tile-config stack slot that precedes each ldtilecfg, with and without -O0.
+; AVX512-LABEL: foo:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movw $32, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movw $32, %ax
+; AVX512-NEXT:    movw $8, %cx
+; AVX512-NEXT:    tilezero %tmm0
+; AVX512-NEXT:    movl $1024, %edx # imm = 0x400
+; AVX512-NEXT:    tilestored %tmm0, (%rdi,%rdx)
+; AVX512-NEXT:    tilerelease
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; AVX2-LABEL: foo:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movw $32, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movw $32, %ax
+; AVX2-NEXT:    movw $8, %cx
+; AVX2-NEXT:    tilezero %tmm0
+; AVX2-NEXT:    movl $1024, %edx # imm = 0x400
+; AVX2-NEXT:    tilestored %tmm0, (%rdi,%rdx)
+; AVX2-NEXT:    tilerelease
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; SSE2-LABEL: foo:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movw $32, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movw $32, %ax
+; SSE2-NEXT:    movw $8, %cx
+; SSE2-NEXT:    tilezero %tmm0
+; SSE2-NEXT:    movl $1024, %edx # imm = 0x400
+; SSE2-NEXT:    tilestored %tmm0, (%rdi,%rdx)
+; SSE2-NEXT:    tilerelease
+; SSE2-NEXT:    retq
+;
+; AVX512-O0-LABEL: foo:
+; AVX512-O0:       # %bb.0: # %entry
+; AVX512-O0-NEXT:    pushq %rbp
+; AVX512-O0-NEXT:    movq %rsp, %rbp
+; AVX512-O0-NEXT:    andq $-1024, %rsp # imm = 0xFC00
+; AVX512-O0-NEXT:    subq $2048, %rsp # imm = 0x800
+; AVX512-O0-NEXT:    movq %rsp, %rdx
+; AVX512-O0-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-O0-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT:    movw $32, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT:    movw $32, %cx
+; AVX512-O0-NEXT:    movw $8, %ax
+; AVX512-O0-NEXT:    tilezero %tmm0
+; AVX512-O0-NEXT:    movl $64, %esi
+; AVX512-O0-NEXT:    tilestored %tmm0, (%rdx,%rsi)
+; AVX512-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; AVX512-O0-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT:    movw $8, %cx
+; AVX512-O0-NEXT:    # kill: def $cl killed $cl killed $cx
+; AVX512-O0-NEXT:    movb %cl, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT:    movw $32, {{[0-9]+}}(%rsp)
+; AVX512-O0-NEXT:    ldtilecfg (%rax)
+; AVX512-O0-NEXT:    movl $64, %esi
+; AVX512-O0-NEXT:    movw $32, %cx
+; AVX512-O0-NEXT:    movw $8, %ax
+; AVX512-O0-NEXT:    tileloadd (%rdx,%rsi), %tmm0
+; AVX512-O0-NEXT:    movl $1024, %edx # imm = 0x400
+; AVX512-O0-NEXT:    movw $32, %cx
+; AVX512-O0-NEXT:    movw $8, %ax
+; AVX512-O0-NEXT:    tilestored %tmm0, (%rdi,%rdx)
+; AVX512-O0-NEXT:    movq %rbp, %rsp
+; AVX512-O0-NEXT:    popq %rbp
+; AVX512-O0-NEXT:    tilerelease
+; AVX512-O0-NEXT:    vzeroupper
+; AVX512-O0-NEXT:    retq
+;
+; AVX2-O0-LABEL: foo:
+; AVX2-O0:       # %bb.0: # %entry
+; AVX2-O0-NEXT:    pushq %rbp
+; AVX2-O0-NEXT:    movq %rsp, %rbp
+; AVX2-O0-NEXT:    andq $-1024, %rsp # imm = 0xFC00
+; AVX2-O0-NEXT:    subq $2048, %rsp # imm = 0x800
+; AVX2-O0-NEXT:    movq %rsp, %rdx
+; AVX2-O0-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-O0-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT:    movw $32, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT:    movw $32, %cx
+; AVX2-O0-NEXT:    movw $8, %ax
+; AVX2-O0-NEXT:    tilezero %tmm0
+; AVX2-O0-NEXT:    movl $64, %esi
+; AVX2-O0-NEXT:    tilestored %tmm0, (%rdx,%rsi)
+; AVX2-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; AVX2-O0-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT:    movw $8, %cx
+; AVX2-O0-NEXT:    # kill: def $cl killed $cl killed $cx
+; AVX2-O0-NEXT:    movb %cl, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT:    movw $32, {{[0-9]+}}(%rsp)
+; AVX2-O0-NEXT:    ldtilecfg (%rax)
+; AVX2-O0-NEXT:    movl $64, %esi
+; AVX2-O0-NEXT:    movw $32, %cx
+; AVX2-O0-NEXT:    movw $8, %ax
+; AVX2-O0-NEXT:    tileloadd (%rdx,%rsi), %tmm0
+; AVX2-O0-NEXT:    movl $1024, %edx # imm = 0x400
+; AVX2-O0-NEXT:    movw $32, %cx
+; AVX2-O0-NEXT:    movw $8, %ax
+; AVX2-O0-NEXT:    tilestored %tmm0, (%rdi,%rdx)
+; AVX2-O0-NEXT:    movq %rbp, %rsp
+; AVX2-O0-NEXT:    popq %rbp
+; AVX2-O0-NEXT:    tilerelease
+; AVX2-O0-NEXT:    vzeroupper
+; AVX2-O0-NEXT:    retq
+;
+; SSE2-O0-LABEL: foo:
+; SSE2-O0:       # %bb.0: # %entry
+; SSE2-O0-NEXT:    pushq %rbp
+; SSE2-O0-NEXT:    movq %rsp, %rbp
+; SSE2-O0-NEXT:    andq $-1024, %rsp # imm = 0xFC00
+; SSE2-O0-NEXT:    subq $2048, %rsp # imm = 0x800
+; SSE2-O0-NEXT:    movq %rsp, %rdx
+; SSE2-O0-NEXT:    xorps %xmm0, %xmm0
+; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movw $32, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movw $32, %cx
+; SSE2-O0-NEXT:    movw $8, %ax
+; SSE2-O0-NEXT:    tilezero %tmm0
+; SSE2-O0-NEXT:    movl $64, %esi
+; SSE2-O0-NEXT:    tilestored %tmm0, (%rdx,%rsi)
+; SSE2-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movw $8, %cx
+; SSE2-O0-NEXT:    # kill: def $cl killed $cl killed $cx
+; SSE2-O0-NEXT:    movb %cl, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    movw $32, {{[0-9]+}}(%rsp)
+; SSE2-O0-NEXT:    ldtilecfg (%rax)
+; SSE2-O0-NEXT:    movl $64, %esi
+; SSE2-O0-NEXT:    movw $32, %cx
+; SSE2-O0-NEXT:    movw $8, %ax
+; SSE2-O0-NEXT:    tileloadd (%rdx,%rsi), %tmm0
+; SSE2-O0-NEXT:    movl $1024, %edx # imm = 0x400
+; SSE2-O0-NEXT:    movw $32, %cx
+; SSE2-O0-NEXT:    movw $8, %ax
+; SSE2-O0-NEXT:    tilestored %tmm0, (%rdi,%rdx)
+; SSE2-O0-NEXT:    movq %rbp, %rsp
+; SSE2-O0-NEXT:    popq %rbp
+; SSE2-O0-NEXT:    tilerelease
+; SSE2-O0-NEXT:    retq
+entry:
+  %t = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) ; tile shape operands; the checks show 8 stored as a byte and 32 as a word in the config area (presumably rows and bytes per row - confirm against the AMX docs)
+  call void @llvm.x86.tilestored64.internal(i16 8, i16 32, i8* %buf, i64 1024, x86_amx %t) ; same shape operands; 1024 ends up in the index register of the final tilestored in the checks (presumably the row stride - confirm)
+  ret void
+}
+
+declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)


        


More information about the llvm-commits mailing list